From 92bfd8a82dada146254f0f34aa531d1af61ae695 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:24:24 +0100
Subject: [PATCH 01/55] Initial AVX-512 support. Adds level detection, mask
 operations, and dedicated AVX-512 implementations for complex int/float
 vector operations that benefit the most.

LLM summary of the changes:

Implemented:
- Added `X86::Avx512` in the generator with Ice Lake feature set, `native_width = 512`, `max_block_size = 512`.
- Generated new `fearless_simd/src/generated/avx512.rs`.
- Wired public API: `Avx512`, `x86::Avx512`, `Level::Avx512`, `Level::as_avx512`, dispatch, and `kernel!` support.
- Updated runtime/static detection so Ice Lake AVX-512 is selected before AVX2, while `as_avx2()` and `as_sse4_2()` downgrade correctly.
- Bumped the MSRV to Rust 1.89 in `Cargo.toml`, the docs, and CI, and added Ice Lake (`icelake-server`) checks to `check_targets.sh`.

Generator/backend behavior:
- 512-bit vectors use native `__m512`, `__m512d`, and `__m512i`.
- AVX-512 masks now use raw compact `__mmask8/16/32/64` storage, with no aligned wrapper.
- The generic `SimdFrom<__mmask*, S> for mask*<S>` / `From<mask*<S>> for __mmask*` conversions now route through `from_bitmask` / `to_bitmask`, so they are correct for non-AVX-512 `S` too.
- Added AVX-512 compare/select paths using mask-returning compares and mask blends.
- Added direct conversion paths, including `f32 <-> i32/u32` and `u8 <-> u16`.
- Added AVX-512 slide operations for vector types only; masks intentionally have no slide support.
- Added dedicated AVX-512 zip/unzip/interleave/deinterleave using `permutex2var`, especially for 256/512-bit widths.

Tests/coverage:
- Extended `#[simd_test]` to include AVX-512.
- Added AVX-512 detection/dispatch coverage.
- Updated mask bitwise tests for canonical boolean mask lanes.
- Added a regression test that AVX-512 mask public types are compact and match `__mmask*` sizes.
---
 .github/workflows/ci.yml                  |   7 +-
 CHANGELOG.md                              |  10 +-
 Cargo.toml                                |   2 +-
 README.md                                 |   2 +-
 check_targets.sh                          |   2 +
 fearless_simd/README.md                   |   2 +-
 fearless_simd/src/generated.rs            |   4 +
 fearless_simd/src/generated/avx2.rs       |  36 +-
 fearless_simd/src/generated/simd_trait.rs |  10 +-
 fearless_simd/src/generated/simd_types.rs |   6 +-
 fearless_simd/src/generated/sse4_2.rs     |  36 +-
 fearless_simd/src/kernel_macros.rs        |  56 +-
 fearless_simd/src/lib.rs                  | 134 +++-
 fearless_simd/src/macros.rs               |   9 +
 fearless_simd_dev_macros/src/lib.rs       |  53 ++
 fearless_simd_gen/src/level.rs            |  58 +-
 fearless_simd_gen/src/main.rs             |   4 +
 fearless_simd_gen/src/mk_simd_trait.rs    |   4 +-
 fearless_simd_gen/src/mk_x86.rs           | 867 +++++++++++++++++++++-
 fearless_simd_gen/src/ops.rs              |   2 +-
 fearless_simd_tests/tests/harness/mod.rs  |  42 +-
 fearless_simd_tests/tests/mod.rs          | 122 ++-
 22 files changed, 1366 insertions(+), 102 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 01cd78a21..906b886c5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,12 +3,12 @@ env:
   # version like 1.70. Note that we only specify MAJOR.MINOR and not PATCH so that bugfixes still
   # come automatically. If the version specified here is no longer the latest stable version,
   # then please feel free to submit a PR that adjusts it along with the potential clippy fixes.
-  RUST_STABLE_VER: "1.88" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7
+  RUST_STABLE_VER: "1.89" # In quotes because otherwise (e.g.) 1.70 would be interpreted as 1.7
   # The purpose of checking with the minimum supported Rust toolchain is to detect its staleness.
   # If the compilation fails, then the version specified here needs to be bumped up to reality.
   # Be sure to also update the rust-version property in the workspace Cargo.toml file,
   # plus all the README.md files of the affected packages.
-  RUST_MIN_VER: "1.88"
+  RUST_MIN_VER: "1.89"
   # List of packages that will be checked with the minimum supported Rust version.
   # This should be limited to packages that are intended for publishing.
   RUST_MIN_VER_PKGS: "-p fearless_simd"
@@ -268,8 +268,7 @@ jobs:
       - name: run tests on CPU with AVX-512
         # Github Actions doesn't give us AVX-512 so this is the only way to exercise AVX-512 codepaths on CI.
         # -icl stands for Ice Lake. Technically Skylake added AVX-512 first, but it's mostly useless there due to
-        # downclocking. When we do eventually add explicit AVX-512 support, we'll likely target the Ice Lake feature
-        # level.
+        # downclocking, so our explicit AVX-512 level targets Ice Lake.
         run: ${SDE_PKG}/sde64 -icl -- cargo test $CARGO_TEST_ARGS
 
   test-aarch64-qemu:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index facb8b857..7638028ac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,15 @@ You can find its changes [documented below](#041-2026-05-16).
 
 ## [Unreleased]
 
-This release has an [MSRV][] of 1.88.
+This release has an [MSRV][] of 1.89.
+
+### Added
+
+- Added Ice Lake-class AVX-512 support with a generated `Avx512` level and 512-bit native-width vector types.
+
+### Changed
+
+- The MSRV is now Rust 1.89.
 
 ## [0.4.1][] (2026-05-16)
 
diff --git a/Cargo.toml b/Cargo.toml
index 0158a30a3..615ede613 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ license = "Apache-2.0 OR MIT"
 repository = "https://github.com/linebender/fearless_simd"
 # Keep in sync with RUST_MIN_VER in .github/workflows/ci.yml, with the relevant README.md files
 # and with the MSRV in the `Unreleased` section of CHANGELOG.md.
-rust-version = "1.88"
+rust-version = "1.89"
 
 [workspace.lints]
 
diff --git a/README.md b/README.md
index 3e7243a11..b94d5beb5 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ It benefited from conversations with Luca Versari, though he is not responsible
 
 ## Minimum supported Rust Version (MSRV)
 
-This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later.
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later.
 
 Future versions of Fearless SIMD might increase the Rust version requirement.
 It will not be treated as a breaking change and as such can even happen with small patch releases.
diff --git a/check_targets.sh b/check_targets.sh
index 90b09fb7f..98e61c22c 100644
--- a/check_targets.sh
+++ b/check_targets.sh
@@ -15,6 +15,8 @@ cargo check -p fearless_simd --target aarch64-linux-android  --features force_su
 cargo check -p fearless_simd --target aarch64-linux-android
 
 # x86_64, at all supported static SIMD levels.
+RUSTFLAGS=-Ctarget-cpu=icelake-server cargo check -p fearless_simd --target x86_64-unknown-linux-gnu
+RUSTFLAGS=-Ctarget-cpu=icelake-server cargo check -p fearless_simd --target x86_64-unknown-linux-gnu --features force_support_fallback
 RUSTFLAGS=-Ctarget-feature=+avx2,+fma cargo check -p fearless_simd --target x86_64-unknown-linux-gnu
 RUSTFLAGS=-Ctarget-feature=+avx2,+fma cargo check -p fearless_simd --target x86_64-unknown-linux-gnu --features force_support_fallback
 RUSTFLAGS=-Ctarget-feature=+sse4.2 cargo check -p fearless_simd --target x86_64-unknown-linux-gnu
diff --git a/fearless_simd/README.md b/fearless_simd/README.md
index 22da184a3..1c4c4410a 100644
--- a/fearless_simd/README.md
+++ b/fearless_simd/README.md
@@ -168,7 +168,7 @@ At least one of `std` and `libm` is required; `std` overrides `libm`.
 
 ## Minimum supported Rust Version (MSRV)
 
-This version of Fearless SIMD has been verified to compile with **Rust 1.88** and later.
+This version of Fearless SIMD has been verified to compile with **Rust 1.89** and later.
 
 Future versions of Fearless SIMD might increase the Rust version requirement.
 It will not be treated as a breaking change and as such can even happen with small patch releases.
diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
index 0fe782230..aa47e1588 100644
--- a/fearless_simd/src/generated.rs
+++ b/fearless_simd/src/generated.rs
@@ -47,6 +47,8 @@
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod avx2;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod avx512;
 mod fallback;
 #[cfg(target_arch = "aarch64")]
 mod neon;
@@ -60,6 +62,8 @@ mod wasm;
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub use avx2::*;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+pub use avx512::*;
 pub use fallback::*;
 #[cfg(target_arch = "aarch64")]
 pub use neon::*;
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 742a82f6b..49b609b6b 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -8353,16 +8353,15 @@ impl<S: Simd> From<u8x32<S>> for __m256i {
 impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i8; 32usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask8x32<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask8x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i8; 32usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
@@ -8398,16 +8397,15 @@ impl<S: Simd> From<u16x16<S>> for __m256i {
 impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i16; 16usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask16x16<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask16x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i16; 16usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
@@ -8443,16 +8441,15 @@ impl<S: Simd> From<u32x8<S>> for __m256i {
 impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i32; 8usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask32x8<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i32; 8usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
@@ -8473,16 +8470,15 @@ impl<S: Simd> From<f64x4<S>> for __m256d {
 impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i64; 4usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask64x4<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask64x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i64; 4usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 7f6eb74ce..4bde9b4e3 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -27,8 +27,8 @@ use crate::{
 #[doc = r" # Associated Types"]
 #[doc = r""]
 #[doc = r#" The trait defines associated types for the highest "native" vector width of each scalar type (e.g. `f32s`,"#]
-#[doc = r" `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits everywhere but"]
-#[doc = r" AVX2, where they are 256 bits."]
+#[doc = r" `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits on the"]
+#[doc = r" fallback, NEON, WASM, and SSE4.2 backends, 256 bits on AVX2, and 512 bits on AVX-512."]
 #[doc = r""]
 #[doc = r" # Example"]
 #[doc = r""]
@@ -218,7 +218,7 @@ pub trait Simd:
     fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self>;
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self>;
     #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."]
     fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self>;
@@ -1070,7 +1070,7 @@ pub trait Simd:
     fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self>;
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self>;
     #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."]
     fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self>;
@@ -1948,7 +1948,7 @@ pub trait Simd:
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
     #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values are saturated to the closest in-range value. NaN becomes 0."]
     fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index 416defc26..335490fd6 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -1572,7 +1572,7 @@ impl<S: Simd> crate::SimdInt<S> for u32x4<S> {
     }
 }
 impl<S: Simd> SimdCvtTruncate<f32x4<S>> for u32x4<S> {
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     #[inline(always)]
     fn truncate_from(x: f32x4<S>) -> Self {
         x.simd.cvt_u32_f32x4(x)
@@ -3644,7 +3644,7 @@ impl<S: Simd> crate::SimdInt<S> for u32x8<S> {
     }
 }
 impl<S: Simd> SimdCvtTruncate<f32x8<S>> for u32x8<S> {
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     #[inline(always)]
     fn truncate_from(x: f32x8<S>) -> Self {
         x.simd.cvt_u32_f32x8(x)
@@ -5713,7 +5713,7 @@ impl<S: Simd> crate::SimdInt<S> for u32x16<S> {
     }
 }
 impl<S: Simd> SimdCvtTruncate<f32x16<S>> for u32x16<S> {
-    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
+    #[doc = "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\nOut-of-range values or NaN will produce implementation-defined results.\n\nOn x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\nIf you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards."]
     #[inline(always)]
     fn truncate_from(x: f32x16<S>) -> Self {
         x.simd.cvt_u32_f32x16(x)
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index caa490558..d55aa6a44 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -8403,16 +8403,15 @@ impl<S: Simd> From<u8x16<S>> for __m128i {
 impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i8; 16usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask8x16<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i8; 16usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
@@ -8448,16 +8447,15 @@ impl<S: Simd> From<u16x8<S>> for __m128i {
 impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i16; 8usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask16x8<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i16; 8usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
@@ -8493,16 +8491,15 @@ impl<S: Simd> From<u32x4<S>> for __m128i {
 impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i32; 4usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask32x4<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i32; 4usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
@@ -8523,16 +8520,15 @@ impl<S: Simd> From<f64x2<S>> for __m128d {
 impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
-            simd,
-        }
+        let lanes: [i64; 2usize] = unsafe { core::mem::transmute_copy(&arch) };
+        lanes.simd_into(simd)
     }
 }
 impl<S: Simd> From<mask64x2<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask64x2<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        let lanes: [i64; 2usize] = value.into();
+        unsafe { core::mem::transmute_copy(&lanes) }
     }
 }
 #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
index c713657b9..f6695c258 100644
--- a/fearless_simd/src/kernel_macros.rs
+++ b/fearless_simd/src/kernel_macros.rs
@@ -8,7 +8,7 @@
 /// use platform-specific intrinsics for parts of the computation.
 ///
 /// The first argument must be a SIMD token written as `token: Neon`,
-/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`.
+/// `token: WasmSimd128`, `token: Sse4_2`, `token: Avx2`, or `token: Avx512`.
 ///
 /// For levels with runtime-detected target features, the macro runs your body
 /// inside an inner function annotated with the appropriate `#[target_feature]`
@@ -54,7 +54,7 @@
 /// However, the body of the function can be as complex as you like.
 ///
 /// The SIMD token type must be written as a bare supported name:
-/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases.
+/// literally `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`. No paths or aliases.
 ///
 /// For soundness, this macro only accepts safe functions.
 ///
@@ -93,7 +93,7 @@ macro_rules! kernel {
     ) => {
         compile_error!(concat!(
             "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
-            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`; got `",
             stringify!($token_ty),
             "`",
         ));
@@ -153,13 +153,27 @@ macro_rules! __fearless_simd_kernel_dispatch {
         }
     };
 
+    (
+        Avx512,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Avx512;
+            @kernel_attrs #[target_feature(
+                enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves"
+            )];
+            $($body)*
+        }
+    };
+
     (
         $token_ty:ident,
         $($body:tt)*
     ) => {
         compile_error!(concat!(
             "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
-            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, `Avx2`, or `Avx512`; got `",
             stringify!($token_ty),
             "`",
         ));
@@ -216,9 +230,9 @@ mod tests {
     #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
     use core::arch::wasm32::{f32x4_add, v128};
     #[cfg(target_arch = "x86")]
-    use core::arch::x86::{__m256i, _mm256_add_epi32};
+    use core::arch::x86::{__m256i, __m512i, _mm256_add_epi32, _mm512_add_epi32};
     #[cfg(target_arch = "x86_64")]
-    use core::arch::x86_64::{__m256i, _mm256_add_epi32};
+    use core::arch::x86_64::{__m256i, __m512i, _mm256_add_epi32, _mm512_add_epi32};
 
     crate::kernel! {
         fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
@@ -238,6 +252,12 @@ mod tests {
         }
     }
 
+    crate::kernel! {
+        fn add_i32x16_avx512(avx512: Avx512, a: __m512i, b: __m512i) -> __m512i {
+            _mm512_add_epi32(a, b)
+        }
+    }
+
     #[cfg(target_arch = "aarch64")]
     #[test]
     fn kernel_instantiates_for_neon() {
@@ -291,4 +311,28 @@ mod tests {
             "`kernel!` should instantiate a working AVX2 kernel"
         );
     }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[test]
+    fn kernel_instantiates_for_avx512() {
+        let Some(avx512) = crate::Level::new().as_avx512() else {
+            return;
+        };
+
+        let a: crate::i32x16<_> =
+            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16].simd_into(avx512);
+        let b: crate::i32x16<_> = [
+            10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
+        ]
+        .simd_into(avx512);
+        let sum: crate::i32x16<_> = add_i32x16_avx512(avx512, a.into(), b.into()).simd_into(avx512);
+
+        assert_eq!(
+            <[i32; 16]>::from(sum),
+            [
+                11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176
+            ],
+            "`kernel!` should instantiate a working AVX-512 kernel"
+        );
+    }
 }
diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs
index 84e91269e..fa63d7b84 100644
--- a/fearless_simd/src/lib.rs
+++ b/fearless_simd/src/lib.rs
@@ -182,9 +182,46 @@ pub mod wasm32 {
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 pub mod x86 {
     pub use crate::generated::Avx2;
+    pub use crate::generated::Avx512;
     pub use crate::generated::Sse4_2;
 }
 
+#[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
+#[inline]
+fn x86_detects_icelake_avx512() -> bool {
+    std::arch::is_x86_feature_detected!("adx")
+        && std::arch::is_x86_feature_detected!("aes")
+        && std::arch::is_x86_feature_detected!("avx512bitalg")
+        && std::arch::is_x86_feature_detected!("avx512bw")
+        && std::arch::is_x86_feature_detected!("avx512cd")
+        && std::arch::is_x86_feature_detected!("avx512dq")
+        && std::arch::is_x86_feature_detected!("avx512f")
+        && std::arch::is_x86_feature_detected!("avx512ifma")
+        && std::arch::is_x86_feature_detected!("avx512vbmi")
+        && std::arch::is_x86_feature_detected!("avx512vbmi2")
+        && std::arch::is_x86_feature_detected!("avx512vl")
+        && std::arch::is_x86_feature_detected!("avx512vnni")
+        && std::arch::is_x86_feature_detected!("avx512vpopcntdq")
+        && std::arch::is_x86_feature_detected!("bmi1")
+        && std::arch::is_x86_feature_detected!("bmi2")
+        && std::arch::is_x86_feature_detected!("cmpxchg16b")
+        && std::arch::is_x86_feature_detected!("fma")
+        && std::arch::is_x86_feature_detected!("gfni")
+        && std::arch::is_x86_feature_detected!("lzcnt")
+        && std::arch::is_x86_feature_detected!("movbe")
+        && std::arch::is_x86_feature_detected!("pclmulqdq")
+        && std::arch::is_x86_feature_detected!("popcnt")
+        && std::arch::is_x86_feature_detected!("rdrand")
+        && std::arch::is_x86_feature_detected!("rdseed")
+        && std::arch::is_x86_feature_detected!("sha")
+        && std::arch::is_x86_feature_detected!("vaes")
+        && std::arch::is_x86_feature_detected!("vpclmulqdq")
+        && std::arch::is_x86_feature_detected!("xsave")
+        && std::arch::is_x86_feature_detected!("xsavec")
+        && std::arch::is_x86_feature_detected!("xsaveopt")
+        && std::arch::is_x86_feature_detected!("xsaves")
+}
+
 /// The level enum with the specific SIMD capabilities available.
 ///
 /// The contained values serve as a proof that the associated target
@@ -246,6 +283,9 @@ pub enum Level {
         ))
     ))]
     Sse4_2(Sse4_2),
+    /// Ice Lake-class AVX-512 on (32 and 64 bit) x86.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    Avx512(Avx512),
     /// The x86-64-v3 instruction set on (32 and 64 bit) x86, including AVX2 and FMA.
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     Avx2(Avx2),
@@ -297,6 +337,10 @@ impl Level {
         }
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
+            if x86_detects_icelake_avx512() {
+                return unsafe { Self::Avx512(Avx512::new_unchecked()) };
+            }
+
             // Feature list sourced from `rustc --print=cfg --target x86_64-unknown-linux-gnu -C target-cpu=x86-64-v3`
             // However, the following features are implied by avx2 and do not need to be spelled out:
             // avx,fxsr,sse,sse2,sse3,sse4.1,sse4.2,ssse3
@@ -470,6 +514,9 @@ impl Level {
     #[inline]
     pub fn as_sse4_2(self) -> Option<Sse4_2> {
         match self {
+            // Safety: The Avx512 struct represents an Ice Lake feature set, which includes the
+            // `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2.
+            Self::Avx512(_avx512) => unsafe { Some(Sse4_2::new_unchecked()) },
             // Safety: The Avx2 struct represents the x86-64-v3 feature set being enabled, which
             // includes the `sse4.2`, `cmpxchg16b`, and `popcnt` features required by Sse4_2.
             Self::Avx2(_avx) => unsafe { Some(Sse4_2::new_unchecked()) },
@@ -513,11 +560,29 @@ impl Level {
             reason = "On machines which statically support `avx2`, there is only one variant."
         )]
         match self {
+            // Safety: The Ice Lake AVX-512 feature set includes the x86-64-v3 features required by Avx2.
+            Self::Avx512(_avx512) => unsafe { Some(Avx2::new_unchecked()) },
             Self::Avx2(avx2) => Some(avx2),
             _ => None,
         }
     }
 
+    /// If this is a proof that the Ice Lake AVX-512 feature set is available, access that
+    /// instruction set.
+    ///
+    /// See [`Avx512::new_unchecked`] for the exact list of CPU features this token enables.
+    ///
+    /// This can be used in combination with the [kernel] macro to safely access level-specific
+    /// SIMD intrinsics.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[inline]
+    pub fn as_avx512(self) -> Option<Avx512> {
+        match self {
+            Self::Avx512(avx512) => Some(avx512),
+            _ => None,
+        }
+    }
+
     /// Get the strongest statically supported SIMD level.
     ///
     /// That is, if your compilation run ambiently declares that a target feature is enabled,
@@ -560,6 +625,40 @@ impl Level {
         }
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
+            #[cfg(all(
+                target_feature = "adx",
+                target_feature = "aes",
+                target_feature = "avx512bitalg",
+                target_feature = "avx512bw",
+                target_feature = "avx512cd",
+                target_feature = "avx512dq",
+                target_feature = "avx512f",
+                target_feature = "avx512ifma",
+                target_feature = "avx512vbmi",
+                target_feature = "avx512vbmi2",
+                target_feature = "avx512vl",
+                target_feature = "avx512vnni",
+                target_feature = "avx512vpopcntdq",
+                target_feature = "bmi1",
+                target_feature = "bmi2",
+                target_feature = "cmpxchg16b",
+                target_feature = "fma",
+                target_feature = "gfni",
+                target_feature = "lzcnt",
+                target_feature = "movbe",
+                target_feature = "pclmulqdq",
+                target_feature = "popcnt",
+                target_feature = "rdrand",
+                target_feature = "rdseed",
+                target_feature = "sha",
+                target_feature = "vaes",
+                target_feature = "vpclmulqdq",
+                target_feature = "xsave",
+                target_feature = "xsavec",
+                target_feature = "xsaveopt",
+                target_feature = "xsaves"
+            ))]
+            return unsafe { Self::Avx512(Avx512::new_unchecked()) };
             #[cfg(all(
                 target_feature = "avx2",
                 target_feature = "bmi1",
@@ -570,7 +669,40 @@ impl Level {
                 target_feature = "lzcnt",
                 target_feature = "movbe",
                 target_feature = "popcnt",
-                target_feature = "xsave"
+                target_feature = "xsave",
+                not(all(
+                    target_feature = "adx",
+                    target_feature = "aes",
+                    target_feature = "avx512bitalg",
+                    target_feature = "avx512bw",
+                    target_feature = "avx512cd",
+                    target_feature = "avx512dq",
+                    target_feature = "avx512f",
+                    target_feature = "avx512ifma",
+                    target_feature = "avx512vbmi",
+                    target_feature = "avx512vbmi2",
+                    target_feature = "avx512vl",
+                    target_feature = "avx512vnni",
+                    target_feature = "avx512vpopcntdq",
+                    target_feature = "bmi1",
+                    target_feature = "bmi2",
+                    target_feature = "cmpxchg16b",
+                    target_feature = "fma",
+                    target_feature = "gfni",
+                    target_feature = "lzcnt",
+                    target_feature = "movbe",
+                    target_feature = "pclmulqdq",
+                    target_feature = "popcnt",
+                    target_feature = "rdrand",
+                    target_feature = "rdseed",
+                    target_feature = "sha",
+                    target_feature = "vaes",
+                    target_feature = "vpclmulqdq",
+                    target_feature = "xsave",
+                    target_feature = "xsavec",
+                    target_feature = "xsaveopt",
+                    target_feature = "xsaves"
+                ))
             ))]
             return unsafe { Self::Avx2(Avx2::new_unchecked()) };
             #[cfg(all(
diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs
index 346913862..be73bd6d1 100644
--- a/fearless_simd/src/macros.rs
+++ b/fearless_simd/src/macros.rs
@@ -103,6 +103,15 @@ macro_rules! dispatch {
                 )
             }
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            $crate::Level::Avx512(avx512) => {
+                let $simd = launder(avx512);
+                $crate::Simd::vectorize(
+                    avx512,
+                    #[inline(always)]
+                    || $op,
+                )
+            }
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             $crate::Level::Avx2(avx2) => {
                 let $simd = launder(avx2);
                 $crate::Simd::vectorize(
diff --git a/fearless_simd_dev_macros/src/lib.rs b/fearless_simd_dev_macros/src/lib.rs
index 438632cb9..78b301110 100644
--- a/fearless_simd_dev_macros/src/lib.rs
+++ b/fearless_simd_dev_macros/src/lib.rs
@@ -21,6 +21,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
     let neon_name = get_ident("neon");
     let sse4_name = get_ident("sse4");
     let avx2_name = get_ident("avx2");
+    let avx512_name = get_ident("avx512");
     let wasm_name = get_ident("wasm");
 
     let ignore_attr = |f: fn(&str) -> bool| {
@@ -40,6 +41,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
     let ignore_neon = ignore_attr(exclude_neon);
     let ignore_sse4 = ignore_attr(exclude_sse4);
     let ignore_avx2 = ignore_attr(exclude_avx2);
+    let ignore_avx512 = ignore_attr(exclude_avx512);
     let ignore_wasm = ignore_attr(exclude_wasm);
 
     let fallback_snippet = quote! {
@@ -116,6 +118,52 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
         }
     };
 
+    let avx512_snippet = quote! {
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        #[test]
+        #ignore_avx512
+        fn #avx512_name() {
+            if std::arch::is_x86_feature_detected!("adx")
+                && std::arch::is_x86_feature_detected!("aes")
+                && std::arch::is_x86_feature_detected!("avx512bitalg")
+                && std::arch::is_x86_feature_detected!("avx512bw")
+                && std::arch::is_x86_feature_detected!("avx512cd")
+                && std::arch::is_x86_feature_detected!("avx512dq")
+                && std::arch::is_x86_feature_detected!("avx512f")
+                && std::arch::is_x86_feature_detected!("avx512ifma")
+                && std::arch::is_x86_feature_detected!("avx512vbmi")
+                && std::arch::is_x86_feature_detected!("avx512vbmi2")
+                && std::arch::is_x86_feature_detected!("avx512vl")
+                && std::arch::is_x86_feature_detected!("avx512vnni")
+                && std::arch::is_x86_feature_detected!("avx512vpopcntdq")
+                && std::arch::is_x86_feature_detected!("bmi1")
+                && std::arch::is_x86_feature_detected!("bmi2")
+                && std::arch::is_x86_feature_detected!("cmpxchg16b")
+                && std::arch::is_x86_feature_detected!("fma")
+                && std::arch::is_x86_feature_detected!("gfni")
+                && std::arch::is_x86_feature_detected!("lzcnt")
+                && std::arch::is_x86_feature_detected!("movbe")
+                && std::arch::is_x86_feature_detected!("pclmulqdq")
+                && std::arch::is_x86_feature_detected!("popcnt")
+                && std::arch::is_x86_feature_detected!("rdrand")
+                && std::arch::is_x86_feature_detected!("rdseed")
+                && std::arch::is_x86_feature_detected!("sha")
+                && std::arch::is_x86_feature_detected!("vaes")
+                && std::arch::is_x86_feature_detected!("vpclmulqdq")
+                && std::arch::is_x86_feature_detected!("xsave")
+                && std::arch::is_x86_feature_detected!("xsavec")
+                && std::arch::is_x86_feature_detected!("xsaveopt")
+                && std::arch::is_x86_feature_detected!("xsaves")
+            {
+                let avx512 = unsafe { fearless_simd::x86::Avx512::new_unchecked() };
+                avx512.vectorize(
+                    #[inline(always)]
+                    || #input_fn_name(avx512)
+                );
+            }
+        }
+    };
+
     let wasm_snippet = quote! {
         #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
         #[test]
@@ -135,6 +183,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
         #wasm_snippet
         #sse4_snippet
         #avx2_snippet
+        #avx512_snippet
     }
     .into()
 }
@@ -158,6 +207,10 @@ fn exclude_avx2(_test_name: &str) -> bool {
     false
 }
 
+fn exclude_avx512(_test_name: &str) -> bool {
+    false
+}
+
 fn exclude_wasm(_test_name: &str) -> bool {
     false
 }
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index 61ec20303..8022eb0e4 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -34,6 +34,13 @@ pub(crate) trait Level {
     /// type *larger* than [`Level::max_block_size`], since [`VecType::aligned_wrapper_ty`] will split those up into
     /// smaller blocks.
     fn arch_ty(&self, vec_ty: &VecType) -> TokenStream;
+    /// The associated storage type used by a public SIMD vector for this level.
+    ///
+    /// Most levels wrap their native storage in an `Aligned*` newtype, but some compact scalar-like
+    /// representations, such as AVX-512 masks, can store the native type directly.
+    fn arch_storage_ty(&self, vec_ty: &VecType) -> TokenStream {
+        vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size())
+    }
     /// The docstring for this SIMD level token.
     fn token_doc(&self) -> &'static str;
     /// Any additional imports or supporting code necessary for the module (for instance, importing
@@ -59,8 +66,7 @@ pub(crate) trait Level {
         let mut assoc_types = vec![];
         for vec_ty in SIMD_TYPES {
             let ty_ident = vec_ty.rust();
-            let wrapper_ty =
-                vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size());
+            let wrapper_ty = self.arch_storage_ty(vec_ty);
             assoc_types.push(quote! {
                 type #ty_ident = #wrapper_ty;
             });
@@ -90,6 +96,19 @@ pub(crate) trait Level {
         }
     }
 
+    fn should_impl_arch_type_conversion(&self, ty: &VecType) -> bool {
+        let n_bits = ty.n_bits();
+        n_bits <= self.max_block_size() && n_bits >= self.native_width()
+    }
+
+    fn should_use_bitmask_arch_type_conversion(&self, _ty: &VecType) -> bool {
+        false
+    }
+
+    fn custom_arch_type_conversion(&self, _ty: &VecType) -> Option<TokenStream> {
+        None
+    }
+
     fn make_simd_impl(&self) -> TokenStream {
         let level_tok = self.token();
         let native_width = self.native_width();
@@ -180,19 +199,40 @@ pub(crate) trait Level {
     }
 
     fn make_type_impl(&self) -> TokenStream {
-        let native_width = self.native_width();
-        let max_block_size = self.max_block_size();
         let mut result = vec![];
         for ty in SIMD_TYPES {
-            let n_bits = ty.n_bits();
             // If n_bits is below our native width (e.g. 128 bits for AVX2), another module will have already
             // implemented the conversion.
-            if n_bits > max_block_size || n_bits < native_width {
+            if !self.should_impl_arch_type_conversion(ty) {
                 continue;
             }
             let simd = ty.rust();
             let arch = self.arch_ty(ty);
-            result.push(quote! {
+            let type_impl = if let Some(type_impl) = self.custom_arch_type_conversion(ty) {
+                type_impl
+            } else if self.should_use_bitmask_arch_type_conversion(ty) {
+                assert_eq!(
+                    ty.scalar,
+                    ScalarType::Mask,
+                    "bitmask arch type conversions are only valid for mask types"
+                );
+                quote! {
+                    impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
+                        #[inline(always)]
+                        fn simd_from(simd: S, arch: #arch) -> Self {
+                            Self::from_bitmask(simd, u64::from(arch))
+                        }
+                    }
+                    impl<S: Simd> From<#simd<S>> for #arch {
+                        #[inline(always)]
+                        #[allow(trivial_numeric_casts, reason = "generated uniformly for all __mmask widths")]
+                        fn from(value: #simd<S>) -> Self {
+                            value.to_bitmask() as #arch
+                        }
+                    }
+                }
+            } else {
+                quote! {
                 impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
                     #[inline(always)]
                     fn simd_from(simd: S, arch: #arch) -> Self {
@@ -208,7 +248,9 @@ pub(crate) trait Level {
                         unsafe { core::mem::transmute_copy(&value.val) }
                     }
                 }
-            });
+                }
+            };
+            result.push(type_impl);
         }
         quote! {
             #( #result )*
diff --git a/fearless_simd_gen/src/main.rs b/fearless_simd_gen/src/main.rs
index 10efdfd99..57df1ba3a 100644
--- a/fearless_simd_gen/src/main.rs
+++ b/fearless_simd_gen/src/main.rs
@@ -36,6 +36,7 @@ enum Module {
     Fallback,
     Sse4_2,
     Avx2,
+    Avx512,
 }
 
 #[derive(Parser)]
@@ -66,6 +67,7 @@ impl Module {
             Self::Fallback => mk_fallback::Fallback.make_module(),
             Self::Sse4_2 => mk_x86::X86::Sse4_2.make_module(),
             Self::Avx2 => mk_x86::X86::Avx2.make_module(),
+            Self::Avx512 => mk_x86::X86::Avx512.make_module(),
         }
     }
 
@@ -105,6 +107,7 @@ impl Module {
             Self::Wasm => "wasm",
             Self::Sse4_2 => "sse4_2",
             Self::Avx2 => "avx2",
+            Self::Avx512 => "avx512",
         }
     }
 }
@@ -118,6 +121,7 @@ const MODULES: &[Module] = &[
     Module::Wasm,
     Module::Sse4_2,
     Module::Avx2,
+    Module::Avx512,
 ];
 
 const FILE_BASE: &str = "./fearless_simd/src/generated";
diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
index fb118cf49..a0b069dc9 100644
--- a/fearless_simd_gen/src/mk_simd_trait.rs
+++ b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -43,8 +43,8 @@ pub(crate) fn mk_simd_trait() -> TokenStream {
         /// # Associated Types
         ///
         /// The trait defines associated types for the highest "native" vector width of each scalar type (e.g. `f32s`,
-        /// `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits everywhere but
-        /// AVX2, where they are 256 bits.
+        /// `u32s`). These are always at least 128 bits, but may be larger. Currently, they are 128 bits on the
+        /// fallback, NEON, WASM, and SSE4.2 backends, 256 bits on AVX2, and 512 bits on AVX-512.
         ///
         /// # Example
         ///
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index d3c3e3b8b..420e8fcb7 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -21,13 +21,17 @@ use quote::{ToTokens as _, format_ident, quote};
 pub(crate) enum X86 {
     Sse4_2,
     Avx2,
+    Avx512,
 }
 
+pub(crate) const AVX512_FEATURES: &str = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves";
+
 impl Level for X86 {
     fn name(&self) -> &'static str {
         match self {
             Self::Sse4_2 => "Sse4_2",
             Self::Avx2 => "Avx2",
+            Self::Avx512 => "Avx512",
         }
     }
 
@@ -35,6 +39,7 @@ impl Level for X86 {
         match self {
             Self::Sse4_2 => 128,
             Self::Avx2 => 256,
+            Self::Avx512 => 512,
         }
     }
 
@@ -46,16 +51,18 @@ impl Level for X86 {
         Some(match self {
             Self::Sse4_2 => "sse4.2,cmpxchg16b,popcnt",
             Self::Avx2 => "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave",
+            Self::Avx512 => AVX512_FEATURES,
         })
     }
 
     fn arch_ty(&self, vec_ty: &VecType) -> TokenStream {
-        // Future AVX-512 backends should be able to keep mask types opaque by storing them as
-        // `__mmask*` predicate registers instead of `__m*i` vectors: for example, `mask8x64`
-        // maps naturally to `__mmask64`, `mask16x32` to `__mmask32`, and `mask32x16`/`mask64x8`
-        // to `__mmask16`/`__mmask8`. Comparisons would return `_mm512_cmp*_mask`, selects would
-        // use `_mm512_mask_blend_*`, and legacy integer-lane interop could materialize vectors
-        // with `_mm512_movm_epi*` only at the API boundary.
+        // AVX-512 masks are compact predicate registers, not vector registers.
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let bits = avx512_mask_register_bits(vec_ty);
+            let name = format!("__mmask{bits}");
+            return Ident::new(&name, Span::call_site()).into_token_stream();
+        }
+
         let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) {
             (ScalarType::Float, 32) => "",
             (ScalarType::Float, 64) => "d",
@@ -66,6 +73,14 @@ impl Level for X86 {
         Ident::new(&name, Span::call_site()).into_token_stream()
     }
 
+    fn arch_storage_ty(&self, vec_ty: &VecType) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            self.arch_ty(vec_ty)
+        } else {
+            vec_ty.aligned_wrapper_ty(|vec_ty| self.arch_ty(vec_ty), self.max_block_size())
+        }
+    }
+
     fn token_doc(&self) -> &'static str {
         match self {
             Self::Sse4_2 => {
@@ -74,6 +89,9 @@ impl Level for X86 {
             Self::Avx2 => {
                 "A token for AVX2 intrinsics on `x86` and `x86_64`, representing the x86-64-v3 level."
             }
+            Self::Avx512 => {
+                "A token for AVX-512 intrinsics on `x86` and `x86_64`, representing an Ice Lake feature level."
+            }
         }
     }
 
@@ -91,6 +109,7 @@ impl Level for X86 {
         let slide_helpers = match self {
             Self::Sse4_2 => Self::sse42_slide_helpers(),
             Self::Avx2 => Self::avx2_slide_helpers(),
+            Self::Avx512 => TokenStream::new(),
         };
 
         quote! {
@@ -135,7 +154,50 @@ impl Level for X86 {
             Self::Avx2 => quote! {
                 Level::#level_tok(self)
             },
+            Self::Avx512 => quote! {
+                Level::#level_tok(self)
+            },
+        }
+    }
+
+    fn should_impl_arch_type_conversion(&self, ty: &VecType) -> bool {
+        let n_bits = ty.n_bits();
+        if *self == Self::Avx512 && ty.scalar == ScalarType::Mask {
+            return n_bits <= self.max_block_size();
         }
+        n_bits <= self.max_block_size() && n_bits >= self.native_width()
+    }
+
+    fn should_use_bitmask_arch_type_conversion(&self, ty: &VecType) -> bool {
+        *self == Self::Avx512 && ty.scalar == ScalarType::Mask
+    }
+
+    fn custom_arch_type_conversion(&self, ty: &VecType) -> Option<TokenStream> {
+        if *self == Self::Avx512 || ty.scalar != ScalarType::Mask {
+            return None;
+        }
+
+        let simd = ty.rust();
+        let arch = self.arch_ty(ty);
+        let lane_ty = ScalarType::Int.rust(ty.scalar_bits);
+        let len = ty.len;
+
+        Some(quote! {
+            impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
+                #[inline(always)]
+                fn simd_from(simd: S, arch: #arch) -> Self {
+                    let lanes: [#lane_ty; #len] = unsafe { core::mem::transmute_copy(&arch) };
+                    lanes.simd_into(simd)
+                }
+            }
+            impl<S: Simd> From<#simd<S>> for #arch {
+                #[inline(always)]
+                fn from(value: #simd<S>) -> Self {
+                    let lanes: [#lane_ty; #len] = value.into();
+                    unsafe { core::mem::transmute_copy(&lanes) }
+                }
+            }
+        })
     }
 
     fn make_impl_body(&self) -> TokenStream {
@@ -165,10 +227,45 @@ impl Level for X86 {
                     Self { _private: () }
                 }
             },
+            Self::Avx512 => quote! {
+                /// Create a SIMD token.
+                ///
+                /// # Safety
+                ///
+                /// The Ice Lake AVX-512 CPU feature set must be available.
+                #[inline]
+                pub const unsafe fn new_unchecked() -> Self {
+                    Self { _private: () }
+                }
+            },
         }
     }
 
     fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool {
+        if *self == Self::Avx512
+            && vec_ty.scalar == ScalarType::Float
+            && vec_ty.n_bits() == 512
+            && matches!(
+                op.method,
+                "floor" | "ceil" | "round_ties_even" | "trunc" | "approximate_recip"
+            )
+        {
+            return true;
+        }
+
+        if *self == Self::Avx512
+            && matches!(
+                op.sig,
+                OpSig::Slide {
+                    granularity: SlideGranularity::WithinBlocks,
+                    ..
+                }
+            )
+            && vec_ty.n_bits() > 128
+        {
+            return true;
+        }
+
         let should_use_generic = op.sig.should_use_generic_op(vec_ty, self.native_width());
         if !should_use_generic {
             return false;
@@ -224,7 +321,17 @@ impl Level for X86 {
                 block_size,
                 block_count,
             } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count),
+            OpSig::FromArray { kind }
+                if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask =>
+            {
+                self.handle_avx512_mask_from_array(method_sig, vec_ty, kind)
+            }
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
+            OpSig::AsArray { kind }
+                if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask =>
+            {
+                self.handle_avx512_mask_as_array(method_sig, vec_ty, kind)
+            }
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
                     self.arch_ty(vec_ty)
@@ -593,8 +700,181 @@ fn signed_literal(value: u64, bits: u32) -> TokenStream {
     }
 }
 
+fn avx512_mask_register_bits(vec_ty: &VecType) -> usize {
+    match vec_ty.len {
+        0..=8 => 8,
+        9..=16 => 16,
+        17..=32 => 32,
+        33..=64 => 64,
+        _ => unreachable!("SIMD masks never have more than 64 lanes"),
+    }
+}
+
+/// Returns a `u64` expression with the low `vec_ty.len` bits set (one bit per mask lane).
+fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream {
+    if vec_ty.len == 64 {
+        quote! { u64::MAX }
+    } else {
+        let bits = (1_u64 << vec_ty.len) - 1;
+        quote! { #bits }
+    }
+}
+
+/// Builds the struct literal for an AVX-512 mask of type `vec_ty` from a bit expression.
+///
+/// Callers in this file pass a `u64` expression. It is emitted unchanged when the mask
+/// register is 64 bits wide; for narrower registers it is wrapped in an inferred `as _`
+/// cast so that it converts to the type of the `val` field.
+fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+    let bits = if avx512_mask_register_bits(vec_ty) == 64 {
+        bits
+    } else {
+        quote! { (#bits) as _ }
+    };
+    // The struct literal itself is shared with `avx512_mask_register_value`.
+    avx512_mask_register_value(vec_ty, bits)
+}
+
+fn avx512_mask_register_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+    let ty = vec_ty.rust();
+    quote! {
+        #ty {
+            val: #bits,
+            simd: self,
+        }
+    }
+}
+
+fn avx512_mask_bits_expr(expr: TokenStream) -> TokenStream {
+    quote! { u64::from((#expr).val) }
+}
+
+fn avx512_compare_op(method: &str) -> &'static str {
+    match method {
+        "simd_eq" => "cmpeq",
+        "simd_lt" => "cmplt",
+        "simd_le" => "cmple",
+        "simd_ge" => "cmpge",
+        "simd_gt" => "cmpgt",
+        _ => unreachable!(),
+    }
+}
+
+fn avx512_float_compare_predicate(method: &str) -> i32 {
+    match method {
+        "simd_eq" => 0x00,
+        "simd_lt" => 0x11,
+        "simd_le" => 0x12,
+        "simd_ge" => 0x1D,
+        "simd_gt" => 0x1E,
+        "ord" => 0x07,
+        "unord" => 0x03,
+        _ => unreachable!(),
+    }
+}
+
+fn avx512_mask_compare_expr(method: &str, vec_ty: &VecType) -> TokenStream {
+    let lane_mask = avx512_mask_lane_bits(vec_ty);
+    match method {
+        "simd_eq" => quote! { !u64::from(a.val ^ b.val) & #lane_mask },
+        _ => unreachable!("masks only support equality comparison"),
+    }
+}
+
+fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident {
+    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+    intrinsic_ident("permutex2var", suffix, vec_ty.n_bits())
+}
+
+fn avx512_mask_blend_intrinsic(vec_ty: &VecType) -> Ident {
+    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+    intrinsic_ident("mask_blend", suffix, vec_ty.n_bits())
+}
+
+/// Emits a `set`/`setr` intrinsic call building an index vector; `indices` lists lane 0 first.
+///
+/// Arms that use a `set` intrinsic (highest lane first) emit the literals in reverse order.
+fn avx512_index_vector(vec_ty: &VecType, indices: impl IntoIterator<Item = usize>) -> TokenStream {
+    let indices: Vec<usize> = indices.into_iter().collect();
+    match (vec_ty.n_bits(), vec_ty.scalar_bits) {
+        (128, 8) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 8));
+            quote! { _mm_setr_epi8(#(#lanes),*) }
+        }
+        (256, 8) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 8));
+            quote! { _mm256_setr_epi8(#(#lanes),*) }
+        }
+        (512, 8) => {
+            let lanes = indices
+                .into_iter()
+                .rev()
+                .map(|i| signed_literal(i as u64, 8));
+            quote! { _mm512_set_epi8(#(#lanes),*) }
+        }
+        (128, 16) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 16));
+            quote! { _mm_setr_epi16(#(#lanes),*) }
+        }
+        (256, 16) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 16));
+            quote! { _mm256_setr_epi16(#(#lanes),*) }
+        }
+        (512, 16) => {
+            let lanes = indices
+                .into_iter()
+                .rev()
+                .map(|i| signed_literal(i as u64, 16));
+            quote! { _mm512_set_epi16(#(#lanes),*) }
+        }
+        (128, 32) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32));
+            quote! { _mm_setr_epi32(#(#lanes),*) }
+        }
+        (256, 32) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32));
+            quote! { _mm256_setr_epi32(#(#lanes),*) }
+        }
+        (512, 32) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 32));
+            quote! { _mm512_setr_epi32(#(#lanes),*) }
+        }
+        (128, 64) => {
+            let lanes = indices
+                .into_iter()
+                .rev()
+                .map(|i| signed_literal(i as u64, 64));
+            quote! { _mm_set_epi64x(#(#lanes),*) }
+        }
+        (256, 64) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 64));
+            quote! { _mm256_setr_epi64x(#(#lanes),*) }
+        }
+        (512, 64) => {
+            let lanes = indices.into_iter().map(|i| signed_literal(i as u64, 64));
+            quote! { _mm512_setr_epi64(#(#lanes),*) }
+        }
+        _ => unreachable!(),
+    }
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let result = avx512_mask_value(
+                vec_ty,
+                quote! {
+                    if val { #lane_mask } else { 0 }
+                },
+            );
+            return quote! {
+                #method_sig {
+                    #result
+                }
+            };
+        }
+
         let intrinsic = set1_intrinsic(vec_ty);
         let cast = match vec_ty.scalar {
             ScalarType::Unsigned => quote!(.cast_signed()),
@@ -612,6 +892,9 @@ impl X86 {
     }
 
     fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool {
+        if *self == Self::Avx512 {
+            return true;
+        }
         self.has_wide_byte_mask_from_bitmask(vec_ty) || self.has_wide_avx2_mask_from_bitmask(vec_ty)
     }
 
@@ -631,9 +914,62 @@ impl X86 {
     }
 
     fn has_specialized_mask_to_bitmask(&self, vec_ty: &VecType) -> bool {
+        if *self == Self::Avx512 {
+            return true;
+        }
         vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16
     }
 
+    /// Emits a body that sets bit `i` of a `u64` for each nonzero `val[i]`, then wraps the bits.
+    pub(crate) fn handle_avx512_mask_from_array(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+        kind: crate::ops::RefKind,
+    ) -> TokenStream {
+        assert_eq!(vec_ty.scalar, ScalarType::Mask);
+        let lane_count = vec_ty.len;
+        let source = match kind {
+            crate::ops::RefKind::Value => quote! { &val },
+            _ => quote! { val },
+        };
+        let packed = avx512_mask_value(vec_ty, quote! { bits });
+        quote! {
+            #method_sig {
+                let val = #source;
+                let mut bits = 0u64;
+                let mut i = 0usize;
+                while i < #lane_count {
+                    if val[i] != 0 {
+                        bits |= 1u64 << i;
+                    }
+                    i += 1;
+                }
+                #packed
+            }
+        }
+    }
+
+    /// Emits a body mapping each bit `i` of `a`'s mask bits to a lane: `!0` if set, else `0`.
+    pub(crate) fn handle_avx512_mask_as_array(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+        kind: crate::ops::RefKind,
+    ) -> TokenStream {
+        assert_eq!(vec_ty.scalar, ScalarType::Mask);
+        if kind != crate::ops::RefKind::Value {
+            panic!("mask array references are not exposed");
+        }
+        let mask_bits = avx512_mask_bits_expr(quote! { a });
+        quote! {
+            #method_sig {
+                let bits = #mask_bits;
+                core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+            }
+        }
+    }
+
     pub(crate) fn handle_mask_from_bitmask(
         &self,
         method_sig: TokenStream,
@@ -645,6 +981,16 @@ impl X86 {
             "mask bitmask conversion only operates on masks"
         );
 
+        if *self == Self::Avx512 {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let result = avx512_mask_value(vec_ty, quote! { bits & #lane_mask });
+            return quote! {
+                #method_sig {
+                    #result
+                }
+            };
+        }
+
         if self.has_wide_byte_mask_from_bitmask(vec_ty) {
             let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty);
             return quote! {
@@ -703,6 +1049,16 @@ impl X86 {
             "mask bitmask conversion only operates on masks"
         );
 
+        if *self == Self::Avx512 {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let bits = avx512_mask_bits_expr(quote! { a });
+            return quote! {
+                #method_sig {
+                    #bits & #lane_mask
+                }
+            };
+        }
+
         match vec_ty.scalar_bits {
             8 => {
                 let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8);
@@ -749,6 +1105,39 @@ impl X86 {
         method: &str,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 {
+            if vec_ty.scalar == ScalarType::Mask {
+                let expr = avx512_mask_compare_expr(method, vec_ty);
+                let result = avx512_mask_value(vec_ty, expr);
+                return quote! {
+                    #method_sig {
+                        #result
+                    }
+                };
+            }
+
+            let mask_ty = vec_ty.mask_ty();
+            let result = if vec_ty.scalar == ScalarType::Float {
+                let predicate = avx512_float_compare_predicate(method);
+                let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+                let intrinsic = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits());
+                avx512_mask_register_value(
+                    &mask_ty,
+                    quote! { #intrinsic::<#predicate>(a.into(), b.into()) },
+                )
+            } else {
+                let cmp = avx512_compare_op(method);
+                let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
+                let intrinsic = intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits());
+                avx512_mask_register_value(&mask_ty, quote! { #intrinsic(a.into(), b.into()) })
+            };
+            return quote! {
+                #method_sig {
+                    unsafe { #result }
+                }
+            };
+        }
+
         let args = [quote! { a.into() }, quote! { b.into() }];
 
         let expr = if vec_ty.scalar != ScalarType::Float {
@@ -830,6 +1219,23 @@ impl X86 {
         method: &str,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let body = match method {
+                "not" => {
+                    let lane_mask = avx512_mask_lane_bits(vec_ty);
+                    let bits = avx512_mask_bits_expr(quote! { a });
+                    let result = avx512_mask_value(vec_ty, quote! { (!#bits) & #lane_mask });
+                    quote! { #result }
+                }
+                _ => unreachable!(),
+            };
+            return quote! {
+                #method_sig {
+                    #body
+                }
+            };
+        }
+
         match method {
             "fract" => {
                 let trunc_op = generic_op_name("trunc", vec_ty);
@@ -885,7 +1291,20 @@ impl X86 {
         let expr = match method {
             "widen" => {
                 match (self, dst_width, vec_ty.n_bits()) {
-                    (Self::Avx2, 256, 128) => {
+                    (Self::Avx2 | Self::Avx512, 256, 128) => {
+                        let extend = extend_intrinsic(
+                            vec_ty.scalar,
+                            vec_ty.scalar_bits,
+                            target_ty.scalar_bits,
+                            dst_width,
+                        );
+                        quote! {
+                            unsafe {
+                                #extend(a.into()).simd_into(self)
+                            }
+                        }
+                    }
+                    (Self::Avx512, 512, 256) => {
                         let extend = extend_intrinsic(
                             vec_ty.scalar,
                             vec_ty.scalar_bits,
@@ -946,6 +1365,14 @@ impl X86 {
             }
             "narrow" => {
                 match (self, dst_width, vec_ty.n_bits()) {
+                    (Self::Avx512, 128, 256) | (Self::Avx512, 256, 512) => {
+                        let narrow = intrinsic_ident("cvtepi16", "epi8", vec_ty.n_bits());
+                        quote! {
+                            unsafe {
+                                #narrow(a.into()).simd_into(self)
+                            }
+                        }
+                    }
                     (Self::Avx2, 128, 256) => {
                         let mask = match target_ty.scalar_bits {
                             8 => {
@@ -1034,6 +1461,52 @@ impl X86 {
         method: &str,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let a_bits = avx512_mask_bits_expr(quote! { a });
+            let b_bits = avx512_mask_bits_expr(quote! { b });
+            let expr = match method {
+                "and" => quote! { (#a_bits & #b_bits) & #lane_mask },
+                "or" => quote! { (#a_bits | #b_bits) & #lane_mask },
+                "xor" => quote! { (#a_bits ^ #b_bits) & #lane_mask },
+                _ => unreachable!(),
+            };
+            let result = avx512_mask_value(vec_ty, expr);
+            return quote! {
+                #method_sig {
+                    #result
+                }
+            };
+        }
+
+        if *self == Self::Avx512
+            && vec_ty.scalar == ScalarType::Float
+            && matches!(method, "min_precise" | "max_precise")
+        {
+            let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
+            let minmax = intrinsic_ident(
+                if method == "max_precise" {
+                    "max"
+                } else {
+                    "min"
+                },
+                suffix,
+                vec_ty.n_bits(),
+            );
+            let cmp = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits());
+            let blend = avx512_mask_blend_intrinsic(vec_ty);
+            let unord = avx512_float_compare_predicate("unord");
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        let intermediate = #minmax(a.into(), b.into());
+                        let b_is_nan = #cmp::<#unord>(b.into(), b.into());
+                        #blend(b_is_nan, intermediate, a.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         let body = match method {
             "mul" if vec_ty.scalar_bits == 8 => {
                 // https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t
@@ -1052,7 +1525,9 @@ impl X86 {
                     }
                 }
             }
-            "shlv" | "shrv" if *self == Self::Avx2 && vec_ty.scalar_bits >= 32 => {
+            "shlv" | "shrv"
+                if matches!(self, Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32 =>
+            {
                 let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
                 let name = match (method, vec_ty.scalar) {
                     ("shrv", ScalarType::Int) => "srav",
@@ -1112,9 +1587,16 @@ impl X86 {
                     #expr(val, #set0())
                 },
                 ScalarType::Int => {
-                    let cmp_intrinsic = intrinsic_ident("cmpgt", "epi8", ty_bits);
+                    let sign_bits = if *self == Self::Avx512 && ty_bits == 512 {
+                        quote! {
+                            _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(#set0(), val))
+                        }
+                    } else {
+                        let cmp_intrinsic = intrinsic_ident("cmpgt", "epi8", ty_bits);
+                        quote! { #cmp_intrinsic(#set0(), val) }
+                    };
                     quote! {
-                        #expr(val, #cmp_intrinsic(#set0(), val))
+                        #expr(val, #sign_bits)
                     }
                 }
                 _ => unimplemented!(),
@@ -1156,7 +1638,7 @@ impl X86 {
         vec_ty: &VecType,
     ) -> TokenStream {
         match method {
-            "mul_add" if *self == Self::Avx2 => {
+            "mul_add" if matches!(self, Self::Avx2 | Self::Avx512) => {
                 let intrinsic = simple_intrinsic("fmadd", vec_ty);
                 quote! {
                     #method_sig {
@@ -1164,7 +1646,7 @@ impl X86 {
                     }
                 }
             }
-            "mul_sub" if *self == Self::Avx2 => {
+            "mul_sub" if matches!(self, Self::Avx2 | Self::Avx512) => {
                 let intrinsic = simple_intrinsic("fmsub", vec_ty);
                 quote! {
                     #method_sig {
@@ -1204,6 +1686,33 @@ impl X86 {
     }
 
     pub(crate) fn handle_select(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+        if *self == Self::Avx512 {
+            if vec_ty.scalar == ScalarType::Mask {
+                let lane_mask = avx512_mask_lane_bits(vec_ty);
+                let a_bits = avx512_mask_bits_expr(quote! { a });
+                let b_bits = avx512_mask_bits_expr(quote! { b });
+                let c_bits = avx512_mask_bits_expr(quote! { c });
+                let result = avx512_mask_value(
+                    vec_ty,
+                    quote! { ((#a_bits & #b_bits) | ((!#a_bits) & #c_bits)) & #lane_mask },
+                );
+                return quote! {
+                    #method_sig {
+                        #result
+                    }
+                };
+            }
+
+            let blend = avx512_mask_blend_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #blend(a.val, c.into(), b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask
         let args = [
             quote! { c.into() },
@@ -1237,7 +1746,49 @@ impl X86 {
         vec_ty: &VecType,
         half_ty: &VecType,
     ) -> TokenStream {
-        if *self == Self::Avx2 && half_ty.n_bits() == 128 {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let half_rust = half_ty.rust();
+            let half_len = half_ty.len;
+            let half_mask = avx512_mask_lane_bits(half_ty);
+            return quote! {
+                #method_sig {
+                    let bits = u64::from(a.val);
+                    (
+                        #half_rust { val: (bits & #half_mask) as _, simd: self },
+                        #half_rust { val: ((bits >> #half_len) & #half_mask) as _, simd: self },
+                    )
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && half_ty.n_bits() == 256 {
+            let (lo, hi) = match vec_ty.scalar {
+                ScalarType::Float if vec_ty.scalar_bits == 32 => (
+                    quote! { _mm512_castps512_ps256(a.into()) },
+                    quote! { _mm512_extractf32x8_ps::<1>(a.into()) },
+                ),
+                ScalarType::Float if vec_ty.scalar_bits == 64 => (
+                    quote! { _mm512_castpd512_pd256(a.into()) },
+                    quote! { _mm512_extractf64x4_pd::<1>(a.into()) },
+                ),
+                _ => (
+                    quote! { _mm512_castsi512_si256(a.into()) },
+                    quote! { _mm512_extracti64x4_epi64::<1>(a.into()) },
+                ),
+            };
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        (
+                            #lo.simd_into(self),
+                            #hi.simd_into(self),
+                        )
+                    }
+                }
+            };
+        }
+
+        if matches!(self, Self::Avx2 | Self::Avx512) && half_ty.n_bits() == 128 {
             let extract_op = match vec_ty.scalar {
                 ScalarType::Float => "extractf128",
                 _ => "extracti128",
@@ -1264,7 +1815,45 @@ impl X86 {
         vec_ty: &VecType,
         combined_ty: &VecType,
     ) -> TokenStream {
-        if *self == Self::Avx2 && combined_ty.n_bits() == 256 {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
+            let combined_rust = combined_ty.rust();
+            let shift = vec_ty.len;
+            let lane_mask = avx512_mask_lane_bits(combined_ty);
+            let bits = if avx512_mask_register_bits(combined_ty) == 64 {
+                quote! { bits }
+            } else {
+                quote! { bits as _ }
+            };
+            return quote! {
+                #method_sig {
+                    let bits = (u64::from(a.val) | (u64::from(b.val) << #shift)) & #lane_mask;
+                    #combined_rust { val: #bits, simd: self }
+                }
+            };
+        }
+
+        if *self == Self::Avx512 && combined_ty.n_bits() == 512 {
+            let expr = match vec_ty.scalar {
+                ScalarType::Float if vec_ty.scalar_bits == 32 => quote! {
+                    _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into())
+                },
+                ScalarType::Float if vec_ty.scalar_bits == 64 => quote! {
+                    _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into())
+                },
+                _ => quote! {
+                    _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into())
+                },
+            };
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #expr.simd_into(self)
+                    }
+                }
+            };
+        }
+
+        if matches!(self, Self::Avx2 | Self::Avx512) && combined_ty.n_bits() == 256 {
             let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) {
                 (ScalarType::Float, 32) => "m128",
                 (ScalarType::Float, 64) => "m128d",
@@ -1289,6 +1878,27 @@ impl X86 {
         vec_ty: &VecType,
         select_low: bool,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let offset = if select_low { 0 } else { vec_ty.len / 2 };
+            let indices = (0..vec_ty.len).map(|i| {
+                let source_lane = offset + (i / 2);
+                if i % 2 == 0 {
+                    source_lane
+                } else {
+                    vec_ty.len + source_lane
+                }
+            });
+            let idx = avx512_index_vector(vec_ty, indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #permute(a.into(), #idx, b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         let expr = match vec_ty.n_bits() {
             128 => {
                 let op = if select_low { "unpacklo" } else { "unpackhi" };
@@ -1342,6 +1952,40 @@ impl X86 {
         method_sig: TokenStream,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let lo_indices = (0..vec_ty.len).map(|i| {
+                let source_lane = i / 2;
+                if i % 2 == 0 {
+                    source_lane
+                } else {
+                    vec_ty.len + source_lane
+                }
+            });
+            let hi_indices = (0..vec_ty.len).map(|i| {
+                let source_lane = (vec_ty.len / 2) + (i / 2);
+                if i % 2 == 0 {
+                    source_lane
+                } else {
+                    vec_ty.len + source_lane
+                }
+            });
+            let lo_idx = avx512_index_vector(vec_ty, lo_indices);
+            let hi_idx = avx512_index_vector(vec_ty, hi_indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        let a = a.into();
+                        let b = b.into();
+                        (
+                            #permute(a, #lo_idx, b).simd_into(self),
+                            #permute(a, #hi_idx, b).simd_into(self),
+                        )
+                    }
+                }
+            };
+        }
+
         match vec_ty.n_bits() {
             256 => {
                 // Optimized path: compute unpacklo and unpackhi once, then use permute2f128 to
@@ -1390,6 +2034,38 @@ impl X86 {
         method_sig: TokenStream,
         vec_ty: &VecType,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let even_indices = (0..vec_ty.len).map(|i| {
+                if i < vec_ty.len / 2 {
+                    i * 2
+                } else {
+                    vec_ty.len + ((i - vec_ty.len / 2) * 2)
+                }
+            });
+            let odd_indices = (0..vec_ty.len).map(|i| {
+                if i < vec_ty.len / 2 {
+                    i * 2 + 1
+                } else {
+                    vec_ty.len + ((i - vec_ty.len / 2) * 2 + 1)
+                }
+            });
+            let even_idx = avx512_index_vector(vec_ty, even_indices);
+            let odd_idx = avx512_index_vector(vec_ty, odd_indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        let a = a.into();
+                        let b = b.into();
+                        (
+                            #permute(a, #even_idx, b).simd_into(self),
+                            #permute(a, #odd_idx, b).simd_into(self),
+                        )
+                    }
+                }
+            };
+        }
+
         match vec_ty.n_bits() {
             256 => {
                 // Optimized path: compute the per-input shuffles once, then use permute2f128 /
@@ -1482,6 +2158,26 @@ impl X86 {
         vec_ty: &VecType,
         select_even: bool,
     ) -> TokenStream {
+        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+            let lane_offset = if select_even { 0 } else { 1 };
+            let indices = (0..vec_ty.len).map(|i| {
+                if i < vec_ty.len / 2 {
+                    i * 2 + lane_offset
+                } else {
+                    vec_ty.len + ((i - vec_ty.len / 2) * 2 + lane_offset)
+                }
+            });
+            let idx = avx512_index_vector(vec_ty, indices);
+            let permute = avx512_permutex2var_intrinsic(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #permute(a.into(), #idx, b.into()).simd_into(self)
+                    }
+                }
+            };
+        }
+
         let expr = match (vec_ty.scalar, vec_ty.n_bits(), vec_ty.scalar_bits) {
             (ScalarType::Float, 128, _) => {
                 // 128-bit shuffle of floats or doubles; there are built-in SSE intrinsics for this
@@ -1588,6 +2284,37 @@ impl X86 {
         let to_bytes = generic_op_name("cvt_to_bytes", vec_ty);
         let from_bytes = generic_op_name("cvt_from_bytes", vec_ty);
 
+        if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 {
+            let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
+            let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len);
+            let set_shift = set1_intrinsic(&byte_ty);
+            let add = simple_sign_unaware_intrinsic("add", &byte_ty);
+            let permute = avx512_permutex2var_intrinsic(&byte_ty);
+            let byte_shift = if scalar_bytes == 1 {
+                quote! { SHIFT }
+            } else {
+                quote! { SHIFT * #scalar_bytes }
+            };
+
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        if SHIFT >= #max_shift {
+                            return b;
+                        }
+
+                        let idx = #add(#base_idx, #set_shift((#byte_shift) as i8));
+                        let result = #permute(
+                            self.#to_bytes(a).val.0,
+                            idx,
+                            self.#to_bytes(b).val.0,
+                        );
+                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
+                    }
+                }
+            };
+        }
+
         let alignr_op = match (granularity, vec_ty.n_bits(), self) {
             (WithinBlocks, 128, _) => {
                 panic!("This should have been handled by generic_op");
@@ -1641,6 +2368,97 @@ impl X86 {
             vec_ty.scalar_bits, target_scalar_bits,
             "we currently only support converting between types of the same width"
         );
+        if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
+            let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
+            let expr = match (vec_ty.scalar, target_scalar) {
+                (ScalarType::Float, ScalarType::Int) => {
+                    let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits());
+                    if precise {
+                        let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                        let blend = avx512_mask_blend_intrinsic(&target_ty);
+                        let set1_float = set1_intrinsic(vec_ty);
+                        let set1_int = set1_intrinsic(&target_ty);
+                        let set0_int =
+                            intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits());
+                        let lt = avx512_float_compare_predicate("simd_lt");
+                        let ord = avx512_float_compare_predicate("ord");
+                        quote! {
+                            unsafe {
+                                let a = a.into();
+                                let mut converted = #convert(a);
+                                let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0));
+                                converted = #blend(in_range, #set1_int(i32::MAX), converted);
+                                let is_not_nan = #cmp::<#ord>(a, a);
+                                converted = #blend(is_not_nan, #set0_int(), converted);
+                                converted.simd_into(self)
+                            }
+                        }
+                    } else {
+                        quote! {
+                            unsafe {
+                                #convert(a.into()).simd_into(self)
+                            }
+                        }
+                    }
+                }
+                (ScalarType::Float, ScalarType::Unsigned) => {
+                    let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits());
+                    if precise {
+                        let max = simple_intrinsic("max", vec_ty);
+                        let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                        let blend = avx512_mask_blend_intrinsic(&target_ty);
+                        let set1_float = set1_intrinsic(vec_ty);
+                        let set1_int = set1_intrinsic(&target_ty);
+                        let set0_float =
+                            intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits());
+                        let lt = avx512_float_compare_predicate("simd_lt");
+                        quote! {
+                            unsafe {
+                                let a = #max(a.into(), #set0_float());
+                                let mut converted = #convert(a);
+                                let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a);
+                                converted = #blend(
+                                    exceeds_unsigned_range,
+                                    converted,
+                                    #set1_int(u32::MAX.cast_signed()),
+                                );
+                                converted.simd_into(self)
+                            }
+                        }
+                    } else {
+                        quote! {
+                            unsafe {
+                                #convert(a.into()).simd_into(self)
+                            }
+                        }
+                    }
+                }
+                (ScalarType::Int, ScalarType::Float) => {
+                    let intrinsic = simple_intrinsic("cvtepi32", &target_ty);
+                    quote! {
+                        unsafe {
+                            #intrinsic(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                (ScalarType::Unsigned, ScalarType::Float) => {
+                    let intrinsic = simple_intrinsic("cvtepu32", &target_ty);
+                    quote! {
+                        unsafe {
+                            #intrinsic(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                _ => unimplemented!(),
+            };
+
+            return quote! {
+                #method_sig {
+                    #expr
+                }
+            };
+        }
+
         let expr = match (vec_ty.scalar, target_scalar) {
             (ScalarType::Float, ScalarType::Int | ScalarType::Unsigned) => {
                 let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
@@ -1865,6 +2683,23 @@ impl X86 {
             "mask reduce ops only operate on masks"
         );
 
+        if *self == Self::Avx512 {
+            let lane_mask = avx512_mask_lane_bits(vec_ty);
+            let bits = avx512_mask_bits_expr(quote! { a });
+            let expr = match (quantifier, condition) {
+                (Quantifier::Any, true) => quote! { bits != 0 },
+                (Quantifier::Any, false) => quote! { bits != #lane_mask },
+                (Quantifier::All, true) => quote! { bits == #lane_mask },
+                (Quantifier::All, false) => quote! { bits == 0 },
+            };
+            return quote! {
+                #method_sig {
+                    let bits = #bits & #lane_mask;
+                    #expr
+                }
+            };
+        }
+
         let (movemask, all_ones) = match vec_ty.scalar_bits {
             32 | 64 => {
                 let float_ty = vec_ty.cast(ScalarType::Float);
@@ -2188,6 +3023,10 @@ impl X86 {
         let vec_widths: &[usize] = match self {
             Self::Sse4_2 => &[128],
             Self::Avx2 => &[128, 256],
+            // AVX-512 uses byte-wise permutex2var for 256/512-bit slide operations.
+            // It only needs the legacy alignr helper for 128-bit slides and for
+            // wider within-block slides that decompose through 128-bit lanes.
+            Self::Avx512 => &[128],
         };
 
         for vec_ty in vec_widths
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index c1129e6be..2e3e7b24b 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -1176,7 +1176,7 @@ pub(crate) const F32_TO_U32: Op = Op::new(
     },
     "Convert each floating-point element to an unsigned 32-bit integer, truncating towards zero.\n\n\
     Out-of-range values or NaN will produce implementation-defined results.\n\n\
-    On x86 platforms, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32` (at least until AVX-512, which is currently not supported).\n\
+    On x86 platforms below AVX-512, this operation will still be slower than converting to `i32`, because there is no native instruction for converting to `u32`.\n\
     If you know your values fit within range of an `i32`, you should convert to an `i32` and cast to your desired datatype afterwards.",
 );
 pub(crate) const F32_TO_U32_PRECISE: Op = Op::new(
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index 19c3c88f4..ca482799a 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -701,47 +701,63 @@ fn combine_u8x16<S: Simd>(simd: S) {
 
 #[simd_test]
 fn and_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0],
+    );
     let b = mask8x16::from_slice(
         simd,
         &[
-            85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
         ],
     );
     assert_eq!(
         <[i8; 16]>::from(a & b),
-        [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
+        [-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0]
     );
 }
 
 #[simd_test]
 fn or_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]);
-    let b = mask8x16::from_slice(simd, &[1, 1, 1, 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[0, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0],
+    );
+    let b = mask8x16::from_slice(
+        simd,
+        &[0, 0, -1, -1, 0, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0, -1],
+    );
     assert_eq!(
         <[i8; 16]>::from(a | b),
-        [1, 1, 3, 3, 6, 7, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]
+        [0, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
     );
 }
 
 #[simd_test]
 fn xor_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 1, 1, 1, 0, 0, 0, 0]);
-    let b = mask8x16::from_slice(simd, &[1, 1, 0, 0, 5, 4, 7, 6, 1, 0, 1, 0, 1, 0, 1, 0]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[0, -1, -1, 0, -1, 0, 0, -1, -1, -1, 0, 0, -1, -1, 0, 0],
+    );
+    let b = mask8x16::from_slice(
+        simd,
+        &[-1, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0],
+    );
     assert_eq!(
         <[i8; 16]>::from(a ^ b),
-        [1, 0, 2, 3, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0]
+        [-1, 0, -1, 0, 0, -1, 0, -1, 0, -1, -1, 0, 0, -1, -1, 0]
     );
 }
 
 #[simd_test]
 fn not_mask8x16<S: Simd>(simd: S) {
-    let a = mask8x16::from_slice(simd, &[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8]);
+    let a = mask8x16::from_slice(
+        simd,
+        &[0, -1, -1, 0, -1, 0, 0, -1, -1, 0, -1, 0, 0, -1, 0, -1],
+    );
     assert_eq!(
         <[i8; 16]>::from(!a),
-        [
-            -1, -2, -3, -4, -5, -6, -7, -8, -2, -3, -4, -5, -6, -7, -8, -9
-        ]
+        [-1, 0, 0, -1, 0, -1, -1, 0, 0, -1, 0, -1, -1, 0, -1, 0]
     );
 }
 
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index 4d2f053d8..f2c39ada3 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -12,6 +12,41 @@ use fearless_simd_dev_macros::simd_test;
 mod harness;
 mod soundness;
 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn x86_detects_icelake_avx512() -> bool {
+    std::arch::is_x86_feature_detected!("adx")
+        && std::arch::is_x86_feature_detected!("aes")
+        && std::arch::is_x86_feature_detected!("avx512bitalg")
+        && std::arch::is_x86_feature_detected!("avx512bw")
+        && std::arch::is_x86_feature_detected!("avx512cd")
+        && std::arch::is_x86_feature_detected!("avx512dq")
+        && std::arch::is_x86_feature_detected!("avx512f")
+        && std::arch::is_x86_feature_detected!("avx512ifma")
+        && std::arch::is_x86_feature_detected!("avx512vbmi")
+        && std::arch::is_x86_feature_detected!("avx512vbmi2")
+        && std::arch::is_x86_feature_detected!("avx512vl")
+        && std::arch::is_x86_feature_detected!("avx512vnni")
+        && std::arch::is_x86_feature_detected!("avx512vpopcntdq")
+        && std::arch::is_x86_feature_detected!("bmi1")
+        && std::arch::is_x86_feature_detected!("bmi2")
+        && std::arch::is_x86_feature_detected!("cmpxchg16b")
+        && std::arch::is_x86_feature_detected!("fma")
+        && std::arch::is_x86_feature_detected!("gfni")
+        && std::arch::is_x86_feature_detected!("lzcnt")
+        && std::arch::is_x86_feature_detected!("movbe")
+        && std::arch::is_x86_feature_detected!("pclmulqdq")
+        && std::arch::is_x86_feature_detected!("popcnt")
+        && std::arch::is_x86_feature_detected!("rdrand")
+        && std::arch::is_x86_feature_detected!("rdseed")
+        && std::arch::is_x86_feature_detected!("sha")
+        && std::arch::is_x86_feature_detected!("vaes")
+        && std::arch::is_x86_feature_detected!("vpclmulqdq")
+        && std::arch::is_x86_feature_detected!("xsave")
+        && std::arch::is_x86_feature_detected!("xsavec")
+        && std::arch::is_x86_feature_detected!("xsaveopt")
+        && std::arch::is_x86_feature_detected!("xsaves")
+}
+
 // Ensure that we can cast between generic native-width vectors
 #[expect(dead_code, reason = "Compile only test")]
 fn generic_cast<S: Simd>(x: S::f32s) -> S::u32s {
@@ -45,7 +80,7 @@ fn supports_highest_level() {
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         assert!(
             level.as_avx2().is_some(),
-            "This machine does not support every `Level` supported by Fearless SIMD (currently AVX2 and below).\n{UNSUPPORTED_LEVEL_MESSAGE}",
+            "This machine does not support every routinely local-tested x86 `Level` supported by Fearless SIMD (currently AVX2 and below; AVX-512 is covered by the SDE CI job).\n{UNSUPPORTED_LEVEL_MESSAGE}",
         );
 
         #[cfg(target_arch = "aarch64")]
@@ -62,6 +97,91 @@ fn supports_highest_level() {
     );
 }
 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[test]
+fn detects_avx512_when_available() {
+    if !x86_detects_icelake_avx512() {
+        return;
+    }
+
+    let level = Level::new();
+    assert!(
+        level.as_avx512().is_some(),
+        "Ice Lake AVX-512 should be selected when all required features are available"
+    );
+    assert!(
+        level.as_avx2().is_some(),
+        "AVX-512 should downgrade to an AVX2 proof"
+    );
+    assert!(
+        level.as_sse4_2().is_some(),
+        "AVX-512 should downgrade to an SSE4.2 proof"
+    );
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[test]
+fn avx512_masks_are_compact() {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+    use std::mem::size_of;
+
+    type A = Avx512;
+
+    assert_eq!(size_of::<mask8x16<A>>(), size_of::<__mmask16>());
+    assert_eq!(size_of::<mask16x8<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask32x4<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask64x2<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask8x32<A>>(), size_of::<__mmask32>());
+    assert_eq!(size_of::<mask16x16<A>>(), size_of::<__mmask16>());
+    assert_eq!(size_of::<mask32x8<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask64x4<A>>(), size_of::<__mmask8>());
+    assert_eq!(size_of::<mask8x64<A>>(), size_of::<__mmask64>());
+    assert_eq!(size_of::<mask16x32<A>>(), size_of::<__mmask32>());
+    assert_eq!(size_of::<mask32x16<A>>(), size_of::<__mmask16>());
+    assert_eq!(size_of::<mask64x8<A>>(), size_of::<__mmask8>());
+}
+
+#[simd_test]
+fn x86_mask_arch_conversions_roundtrip<S: Simd>(simd: S) {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::*;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::*;
+
+        macro_rules! assert_roundtrip {
+            ($mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{
+                let bits: u64 = $bits;
+                let expected: [$lane; $lanes] =
+                    core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { -1 } else { 0 });
+
+                let mask = $mask::from_bitmask(simd, bits);
+                let arch: $arch = mask.into();
+                let lanes: [$lane; $lanes] = unsafe { core::mem::transmute_copy(&arch) };
+                assert_eq!(lanes, expected);
+
+                let arch: $arch = unsafe { core::mem::transmute_copy(&expected) };
+                let mask = $mask::simd_from(simd, arch);
+                assert_eq!(mask.to_bitmask(), bits);
+            }};
+        }
+
+        assert_roundtrip!(mask8x16, __m128i, i8, 16, 0xa55a);
+        assert_roundtrip!(mask16x8, __m128i, i16, 8, 0xa5);
+        assert_roundtrip!(mask32x4, __m128i, i32, 4, 0xb);
+        assert_roundtrip!(mask64x2, __m128i, i64, 2, 0x2);
+
+        assert_roundtrip!(mask8x32, __m256i, i8, 32, 0xa55a_5aa5);
+        assert_roundtrip!(mask16x16, __m256i, i16, 16, 0x5aa5);
+        assert_roundtrip!(mask32x8, __m256i, i32, 8, 0xa5);
+        assert_roundtrip!(mask64x4, __m256i, i64, 4, 0xb);
+    }
+}
+
 #[simd_test]
 #[ignore]
 fn test_f32_to_i32_precise_exhaustive<S: Simd>(simd: S) {

From f08f7e6dcd3ac5ddaaa8d6f035dc7ab38be46db0 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:22:22 +0100
Subject: [PATCH 02/55] Add checked_transmute_copy and ban transmute_copy to
 statically reject size-mismatched copies (the spooky bug I almost introduced)

---
 .clippy.toml                          |   5 +
 Cargo.toml                            |   1 +
 fearless_simd/src/generated/avx2.rs   | 168 ++++++++--------
 fearless_simd/src/generated/neon.rs   | 264 +++++++++++++-------------
 fearless_simd/src/generated/sse4_2.rs | 168 ++++++++--------
 fearless_simd/src/generated/wasm.rs   | 168 ++++++++--------
 fearless_simd/src/support.rs          |  25 +++
 fearless_simd_gen/src/generic.rs      |  14 +-
 fearless_simd_gen/src/level.rs        |   4 +-
 fearless_simd_gen/src/mk_x86.rs       |   5 +-
 fearless_simd_tests/tests/mod.rs      |   4 +-
 11 files changed, 429 insertions(+), 397 deletions(-)

diff --git a/.clippy.toml b/.clippy.toml
index 4781d68cb..ea0a2fd43 100644
--- a/.clippy.toml
+++ b/.clippy.toml
@@ -7,4 +7,9 @@
 # 16 bytes is the number of bytes that fits into two 64-bit CPU registers.
 trivial-copy-size-limit = 16
 
+disallowed-methods = [
+    { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
+    { path = "std::mem::transmute_copy", reason = "Use a checked wrapper so equal sizes are asserted at compile time." },
+]
+
 # END LINEBENDER LINT SET
diff --git a/Cargo.toml b/Cargo.toml
index 615ede613..8721b67e4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,6 +44,7 @@ clippy.collection_is_never_read = "warn"
 clippy.default_trait_access = "warn"
 clippy.dbg_macro = "warn"
 clippy.debug_assert_with_mut_call = "warn"
+clippy.disallowed_methods = "deny"
 clippy.doc_markdown = "warn"
 clippy.fn_to_numeric_cast_any = "warn"
 clippy.infinite_loop = "warn"
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 49b609b6b..2c2dfa5aa 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -103,14 +103,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -413,14 +413,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -647,14 +647,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -890,7 +890,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -974,14 +974,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1183,14 +1183,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1401,7 +1401,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1485,14 +1485,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1696,14 +1696,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1922,7 +1922,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -2001,14 +2001,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2243,7 +2243,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -2322,14 +2322,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2695,14 +2695,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3012,14 +3012,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3343,7 +3343,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -3444,14 +3444,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3742,14 +3742,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4061,7 +4061,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4160,14 +4160,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4432,14 +4432,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4722,7 +4722,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4813,14 +4813,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5118,7 +5118,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -5210,14 +5210,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5630,14 +5630,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5912,14 +5912,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6239,7 +6239,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -6376,14 +6376,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6667,14 +6667,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7021,7 +7021,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7135,14 +7135,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7422,14 +7422,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7741,7 +7741,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7863,14 +7863,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -8191,7 +8191,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -8309,7 +8309,7 @@ impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8317,14 +8317,14 @@ impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
 impl<S: Simd> From<f32x8<S>> for __m256 {
     #[inline(always)]
     fn from(value: f32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8332,14 +8332,14 @@ impl<S: Simd> SimdFrom<__m256i, S> for i8x32<S> {
 impl<S: Simd> From<i8x32<S>> for __m256i {
     #[inline(always)]
     fn from(value: i8x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for u8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8347,13 +8347,13 @@ impl<S: Simd> SimdFrom<__m256i, S> for u8x32<S> {
 impl<S: Simd> From<u8x32<S>> for __m256i {
     #[inline(always)]
     fn from(value: u8x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        let lanes: [i8; 32usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i8; 32usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8361,14 +8361,14 @@ impl<S: Simd> From<mask8x32<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask8x32<S>) -> Self {
         let lanes: [i8; 32usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8376,14 +8376,14 @@ impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
 impl<S: Simd> From<i16x16<S>> for __m256i {
     #[inline(always)]
     fn from(value: i16x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for u16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8391,13 +8391,13 @@ impl<S: Simd> SimdFrom<__m256i, S> for u16x16<S> {
 impl<S: Simd> From<u16x16<S>> for __m256i {
     #[inline(always)]
     fn from(value: u16x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        let lanes: [i16; 16usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i16; 16usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8405,14 +8405,14 @@ impl<S: Simd> From<mask16x16<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask16x16<S>) -> Self {
         let lanes: [i16; 16usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8420,14 +8420,14 @@ impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
 impl<S: Simd> From<i32x8<S>> for __m256i {
     #[inline(always)]
     fn from(value: i32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for u32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8435,13 +8435,13 @@ impl<S: Simd> SimdFrom<__m256i, S> for u32x8<S> {
 impl<S: Simd> From<u32x8<S>> for __m256i {
     #[inline(always)]
     fn from(value: u32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        let lanes: [i32; 8usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i32; 8usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8449,14 +8449,14 @@ impl<S: Simd> From<mask32x8<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask32x8<S>) -> Self {
         let lanes: [i32; 8usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256d) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8464,13 +8464,13 @@ impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
 impl<S: Simd> From<f64x4<S>> for __m256d {
     #[inline(always)]
     fn from(value: f64x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
-        let lanes: [i64; 4usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i64; 4usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8478,7 +8478,7 @@ impl<S: Simd> From<mask64x4<S>> for __m256i {
     #[inline(always)]
     fn from(value: mask64x4<S>) -> Self {
         let lanes: [i64; 4usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index fe02e32d6..ca5486cbc 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -93,14 +93,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -358,14 +358,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -568,14 +568,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -781,7 +781,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -874,14 +874,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1084,14 +1084,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1293,7 +1293,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1377,14 +1377,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1591,14 +1591,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1800,7 +1800,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1884,14 +1884,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2124,7 +2124,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -2209,14 +2209,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2613,14 +2613,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2924,14 +2924,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3230,7 +3230,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -3345,14 +3345,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3656,14 +3656,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3971,7 +3971,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4086,14 +4086,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4402,14 +4402,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4705,7 +4705,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4820,14 +4820,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5177,7 +5177,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -5292,14 +5292,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5713,14 +5713,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6033,14 +6033,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6351,7 +6351,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -6457,14 +6457,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6786,14 +6786,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7123,7 +7123,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7232,14 +7232,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7557,14 +7557,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7877,7 +7877,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7983,14 +7983,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -8349,7 +8349,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -8452,7 +8452,7 @@ impl<S: Simd> SimdFrom<float32x4_t, S> for f32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: float32x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8460,14 +8460,14 @@ impl<S: Simd> SimdFrom<float32x4_t, S> for f32x4<S> {
 impl<S: Simd> From<f32x4<S>> for float32x4_t {
     #[inline(always)]
     fn from(value: f32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int8x16_t, S> for i8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int8x16_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8475,14 +8475,14 @@ impl<S: Simd> SimdFrom<int8x16_t, S> for i8x16<S> {
 impl<S: Simd> From<i8x16<S>> for int8x16_t {
     #[inline(always)]
     fn from(value: i8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint8x16_t, S> for u8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint8x16_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8490,14 +8490,14 @@ impl<S: Simd> SimdFrom<uint8x16_t, S> for u8x16<S> {
 impl<S: Simd> From<u8x16<S>> for uint8x16_t {
     #[inline(always)]
     fn from(value: u8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int8x16_t, S> for mask8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int8x16_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8505,14 +8505,14 @@ impl<S: Simd> SimdFrom<int8x16_t, S> for mask8x16<S> {
 impl<S: Simd> From<mask8x16<S>> for int8x16_t {
     #[inline(always)]
     fn from(value: mask8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int16x8_t, S> for i16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int16x8_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8520,14 +8520,14 @@ impl<S: Simd> SimdFrom<int16x8_t, S> for i16x8<S> {
 impl<S: Simd> From<i16x8<S>> for int16x8_t {
     #[inline(always)]
     fn from(value: i16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint16x8_t, S> for u16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint16x8_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8535,14 +8535,14 @@ impl<S: Simd> SimdFrom<uint16x8_t, S> for u16x8<S> {
 impl<S: Simd> From<u16x8<S>> for uint16x8_t {
     #[inline(always)]
     fn from(value: u16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int16x8_t, S> for mask16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int16x8_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8550,14 +8550,14 @@ impl<S: Simd> SimdFrom<int16x8_t, S> for mask16x8<S> {
 impl<S: Simd> From<mask16x8<S>> for int16x8_t {
     #[inline(always)]
     fn from(value: mask16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int32x4_t, S> for i32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int32x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8565,14 +8565,14 @@ impl<S: Simd> SimdFrom<int32x4_t, S> for i32x4<S> {
 impl<S: Simd> From<i32x4<S>> for int32x4_t {
     #[inline(always)]
     fn from(value: i32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint32x4_t, S> for u32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint32x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8580,14 +8580,14 @@ impl<S: Simd> SimdFrom<uint32x4_t, S> for u32x4<S> {
 impl<S: Simd> From<u32x4<S>> for uint32x4_t {
     #[inline(always)]
     fn from(value: u32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int32x4_t, S> for mask32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int32x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8595,14 +8595,14 @@ impl<S: Simd> SimdFrom<int32x4_t, S> for mask32x4<S> {
 impl<S: Simd> From<mask32x4<S>> for int32x4_t {
     #[inline(always)]
     fn from(value: mask32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<float64x2_t, S> for f64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: float64x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8610,14 +8610,14 @@ impl<S: Simd> SimdFrom<float64x2_t, S> for f64x2<S> {
 impl<S: Simd> From<f64x2<S>> for float64x2_t {
     #[inline(always)]
     fn from(value: f64x2<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int64x2_t, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int64x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8625,14 +8625,14 @@ impl<S: Simd> SimdFrom<int64x2_t, S> for mask64x2<S> {
 impl<S: Simd> From<mask64x2<S>> for int64x2_t {
     #[inline(always)]
     fn from(value: mask64x2<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<float32x4x2_t, S> for f32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: float32x4x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8640,14 +8640,14 @@ impl<S: Simd> SimdFrom<float32x4x2_t, S> for f32x8<S> {
 impl<S: Simd> From<f32x8<S>> for float32x4x2_t {
     #[inline(always)]
     fn from(value: f32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int8x16x2_t, S> for i8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int8x16x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8655,14 +8655,14 @@ impl<S: Simd> SimdFrom<int8x16x2_t, S> for i8x32<S> {
 impl<S: Simd> From<i8x32<S>> for int8x16x2_t {
     #[inline(always)]
     fn from(value: i8x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint8x16x2_t, S> for u8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint8x16x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8670,14 +8670,14 @@ impl<S: Simd> SimdFrom<uint8x16x2_t, S> for u8x32<S> {
 impl<S: Simd> From<u8x32<S>> for uint8x16x2_t {
     #[inline(always)]
     fn from(value: u8x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int8x16x2_t, S> for mask8x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int8x16x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8685,14 +8685,14 @@ impl<S: Simd> SimdFrom<int8x16x2_t, S> for mask8x32<S> {
 impl<S: Simd> From<mask8x32<S>> for int8x16x2_t {
     #[inline(always)]
     fn from(value: mask8x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int16x8x2_t, S> for i16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int16x8x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8700,14 +8700,14 @@ impl<S: Simd> SimdFrom<int16x8x2_t, S> for i16x16<S> {
 impl<S: Simd> From<i16x16<S>> for int16x8x2_t {
     #[inline(always)]
     fn from(value: i16x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint16x8x2_t, S> for u16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint16x8x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8715,14 +8715,14 @@ impl<S: Simd> SimdFrom<uint16x8x2_t, S> for u16x16<S> {
 impl<S: Simd> From<u16x16<S>> for uint16x8x2_t {
     #[inline(always)]
     fn from(value: u16x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int16x8x2_t, S> for mask16x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int16x8x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8730,14 +8730,14 @@ impl<S: Simd> SimdFrom<int16x8x2_t, S> for mask16x16<S> {
 impl<S: Simd> From<mask16x16<S>> for int16x8x2_t {
     #[inline(always)]
     fn from(value: mask16x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int32x4x2_t, S> for i32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int32x4x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8745,14 +8745,14 @@ impl<S: Simd> SimdFrom<int32x4x2_t, S> for i32x8<S> {
 impl<S: Simd> From<i32x8<S>> for int32x4x2_t {
     #[inline(always)]
     fn from(value: i32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint32x4x2_t, S> for u32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint32x4x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8760,14 +8760,14 @@ impl<S: Simd> SimdFrom<uint32x4x2_t, S> for u32x8<S> {
 impl<S: Simd> From<u32x8<S>> for uint32x4x2_t {
     #[inline(always)]
     fn from(value: u32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int32x4x2_t, S> for mask32x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int32x4x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8775,14 +8775,14 @@ impl<S: Simd> SimdFrom<int32x4x2_t, S> for mask32x8<S> {
 impl<S: Simd> From<mask32x8<S>> for int32x4x2_t {
     #[inline(always)]
     fn from(value: mask32x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<float64x2x2_t, S> for f64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: float64x2x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8790,14 +8790,14 @@ impl<S: Simd> SimdFrom<float64x2x2_t, S> for f64x4<S> {
 impl<S: Simd> From<f64x4<S>> for float64x2x2_t {
     #[inline(always)]
     fn from(value: f64x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int64x2x2_t, S> for mask64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int64x2x2_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8805,14 +8805,14 @@ impl<S: Simd> SimdFrom<int64x2x2_t, S> for mask64x4<S> {
 impl<S: Simd> From<mask64x4<S>> for int64x2x2_t {
     #[inline(always)]
     fn from(value: mask64x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<float32x4x4_t, S> for f32x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: float32x4x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8820,14 +8820,14 @@ impl<S: Simd> SimdFrom<float32x4x4_t, S> for f32x16<S> {
 impl<S: Simd> From<f32x16<S>> for float32x4x4_t {
     #[inline(always)]
     fn from(value: f32x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int8x16x4_t, S> for i8x64<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int8x16x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8835,14 +8835,14 @@ impl<S: Simd> SimdFrom<int8x16x4_t, S> for i8x64<S> {
 impl<S: Simd> From<i8x64<S>> for int8x16x4_t {
     #[inline(always)]
     fn from(value: i8x64<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint8x16x4_t, S> for u8x64<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint8x16x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8850,14 +8850,14 @@ impl<S: Simd> SimdFrom<uint8x16x4_t, S> for u8x64<S> {
 impl<S: Simd> From<u8x64<S>> for uint8x16x4_t {
     #[inline(always)]
     fn from(value: u8x64<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int8x16x4_t, S> for mask8x64<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int8x16x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8865,14 +8865,14 @@ impl<S: Simd> SimdFrom<int8x16x4_t, S> for mask8x64<S> {
 impl<S: Simd> From<mask8x64<S>> for int8x16x4_t {
     #[inline(always)]
     fn from(value: mask8x64<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int16x8x4_t, S> for i16x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int16x8x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8880,14 +8880,14 @@ impl<S: Simd> SimdFrom<int16x8x4_t, S> for i16x32<S> {
 impl<S: Simd> From<i16x32<S>> for int16x8x4_t {
     #[inline(always)]
     fn from(value: i16x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint16x8x4_t, S> for u16x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint16x8x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8895,14 +8895,14 @@ impl<S: Simd> SimdFrom<uint16x8x4_t, S> for u16x32<S> {
 impl<S: Simd> From<u16x32<S>> for uint16x8x4_t {
     #[inline(always)]
     fn from(value: u16x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int16x8x4_t, S> for mask16x32<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int16x8x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8910,14 +8910,14 @@ impl<S: Simd> SimdFrom<int16x8x4_t, S> for mask16x32<S> {
 impl<S: Simd> From<mask16x32<S>> for int16x8x4_t {
     #[inline(always)]
     fn from(value: mask16x32<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int32x4x4_t, S> for i32x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int32x4x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8925,14 +8925,14 @@ impl<S: Simd> SimdFrom<int32x4x4_t, S> for i32x16<S> {
 impl<S: Simd> From<i32x16<S>> for int32x4x4_t {
     #[inline(always)]
     fn from(value: i32x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<uint32x4x4_t, S> for u32x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: uint32x4x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8940,14 +8940,14 @@ impl<S: Simd> SimdFrom<uint32x4x4_t, S> for u32x16<S> {
 impl<S: Simd> From<u32x16<S>> for uint32x4x4_t {
     #[inline(always)]
     fn from(value: u32x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int32x4x4_t, S> for mask32x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int32x4x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8955,14 +8955,14 @@ impl<S: Simd> SimdFrom<int32x4x4_t, S> for mask32x16<S> {
 impl<S: Simd> From<mask32x16<S>> for int32x4x4_t {
     #[inline(always)]
     fn from(value: mask32x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<float64x2x4_t, S> for f64x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: float64x2x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8970,14 +8970,14 @@ impl<S: Simd> SimdFrom<float64x2x4_t, S> for f64x8<S> {
 impl<S: Simd> From<f64x8<S>> for float64x2x4_t {
     #[inline(always)]
     fn from(value: f64x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<int64x2x4_t, S> for mask64x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int64x2x4_t) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8985,7 +8985,7 @@ impl<S: Simd> SimdFrom<int64x2x4_t, S> for mask64x8<S> {
 impl<S: Simd> From<mask64x8<S>> for int64x2x4_t {
     #[inline(always)]
     fn from(value: mask64x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 #[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index d55aa6a44..a2d90513e 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -129,14 +129,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -442,14 +442,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -679,14 +679,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -930,7 +930,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1017,14 +1017,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1229,14 +1229,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1450,7 +1450,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1537,14 +1537,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1751,14 +1751,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1980,7 +1980,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -2062,14 +2062,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2307,7 +2307,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -2390,14 +2390,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2772,14 +2772,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3061,14 +3061,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3345,7 +3345,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -3458,14 +3458,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3747,14 +3747,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4042,7 +4042,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4157,14 +4157,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4451,14 +4451,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4732,7 +4732,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4845,14 +4845,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5180,7 +5180,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -5293,14 +5293,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5713,14 +5713,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5995,14 +5995,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6322,7 +6322,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -6465,14 +6465,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6756,14 +6756,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7102,7 +7102,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7216,14 +7216,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7503,14 +7503,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7822,7 +7822,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7928,14 +7928,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -8256,7 +8256,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -8359,7 +8359,7 @@ impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8367,14 +8367,14 @@ impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
 impl<S: Simd> From<f32x4<S>> for __m128 {
     #[inline(always)]
     fn from(value: f32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8382,14 +8382,14 @@ impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
 impl<S: Simd> From<i8x16<S>> for __m128i {
     #[inline(always)]
     fn from(value: i8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8397,13 +8397,13 @@ impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
 impl<S: Simd> From<u8x16<S>> for __m128i {
     #[inline(always)]
     fn from(value: u8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        let lanes: [i8; 16usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i8; 16usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8411,14 +8411,14 @@ impl<S: Simd> From<mask8x16<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask8x16<S>) -> Self {
         let lanes: [i8; 16usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8426,14 +8426,14 @@ impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
 impl<S: Simd> From<i16x8<S>> for __m128i {
     #[inline(always)]
     fn from(value: i16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8441,13 +8441,13 @@ impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
 impl<S: Simd> From<u16x8<S>> for __m128i {
     #[inline(always)]
     fn from(value: u16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        let lanes: [i16; 8usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i16; 8usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8455,14 +8455,14 @@ impl<S: Simd> From<mask16x8<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask16x8<S>) -> Self {
         let lanes: [i16; 8usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8470,14 +8470,14 @@ impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
 impl<S: Simd> From<i32x4<S>> for __m128i {
     #[inline(always)]
     fn from(value: i32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8485,13 +8485,13 @@ impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
 impl<S: Simd> From<u32x4<S>> for __m128i {
     #[inline(always)]
     fn from(value: u32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        let lanes: [i32; 4usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i32; 4usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8499,14 +8499,14 @@ impl<S: Simd> From<mask32x4<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask32x4<S>) -> Self {
         let lanes: [i32; 4usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128d) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8514,13 +8514,13 @@ impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
 impl<S: Simd> From<f64x2<S>> for __m128d {
     #[inline(always)]
     fn from(value: f64x2<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
-        let lanes: [i64; 2usize] = unsafe { core::mem::transmute_copy(&arch) };
+        let lanes: [i64; 2usize] = unsafe { crate::support::checked_transmute_copy(&arch) };
         lanes.simd_into(simd)
     }
 }
@@ -8528,7 +8528,7 @@ impl<S: Simd> From<mask64x2<S>> for __m128i {
     #[inline(always)]
     fn from(value: mask64x2<S>) -> Self {
         let lanes: [i64; 2usize] = value.into();
-        unsafe { core::mem::transmute_copy(&lanes) }
+        unsafe { crate::support::checked_transmute_copy(&lanes) }
     }
 }
 #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 004afa03f..faeffed9e 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -92,14 +92,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -398,14 +398,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -623,14 +623,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -847,7 +847,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -934,14 +934,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1143,14 +1143,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1349,7 +1349,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1434,14 +1434,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1647,14 +1647,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -1853,7 +1853,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -1938,14 +1938,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2203,7 +2203,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -2289,14 +2289,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2671,14 +2671,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -2960,14 +2960,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3244,7 +3244,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -3357,14 +3357,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3646,14 +3646,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -3939,7 +3939,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4052,14 +4052,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4346,14 +4346,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -4627,7 +4627,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -4740,14 +4740,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5075,7 +5075,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -5188,14 +5188,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5605,14 +5605,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -5887,14 +5887,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6225,7 +6225,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -6331,14 +6331,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6622,14 +6622,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -6955,7 +6955,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7064,14 +7064,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7351,14 +7351,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -7667,7 +7667,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -7773,14 +7773,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: unsafe { core::mem::transmute_copy(val) },
+            val: { unsafe { crate::support::checked_transmute_copy(val) } },
             simd: self,
         }
     }
@@ -8101,7 +8101,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: unsafe { core::mem::transmute_copy(&val) },
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
             simd: self,
         }
     }
@@ -8204,7 +8204,7 @@ impl<S: Simd> SimdFrom<v128, S> for f32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8212,14 +8212,14 @@ impl<S: Simd> SimdFrom<v128, S> for f32x4<S> {
 impl<S: Simd> From<f32x4<S>> for v128 {
     #[inline(always)]
     fn from(value: f32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for i8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8227,14 +8227,14 @@ impl<S: Simd> SimdFrom<v128, S> for i8x16<S> {
 impl<S: Simd> From<i8x16<S>> for v128 {
     #[inline(always)]
     fn from(value: i8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for u8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8242,14 +8242,14 @@ impl<S: Simd> SimdFrom<v128, S> for u8x16<S> {
 impl<S: Simd> From<u8x16<S>> for v128 {
     #[inline(always)]
     fn from(value: u8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for mask8x16<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8257,14 +8257,14 @@ impl<S: Simd> SimdFrom<v128, S> for mask8x16<S> {
 impl<S: Simd> From<mask8x16<S>> for v128 {
     #[inline(always)]
     fn from(value: mask8x16<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for i16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8272,14 +8272,14 @@ impl<S: Simd> SimdFrom<v128, S> for i16x8<S> {
 impl<S: Simd> From<i16x8<S>> for v128 {
     #[inline(always)]
     fn from(value: i16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for u16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8287,14 +8287,14 @@ impl<S: Simd> SimdFrom<v128, S> for u16x8<S> {
 impl<S: Simd> From<u16x8<S>> for v128 {
     #[inline(always)]
     fn from(value: u16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for mask16x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8302,14 +8302,14 @@ impl<S: Simd> SimdFrom<v128, S> for mask16x8<S> {
 impl<S: Simd> From<mask16x8<S>> for v128 {
     #[inline(always)]
     fn from(value: mask16x8<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for i32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8317,14 +8317,14 @@ impl<S: Simd> SimdFrom<v128, S> for i32x4<S> {
 impl<S: Simd> From<i32x4<S>> for v128 {
     #[inline(always)]
     fn from(value: i32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for u32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8332,14 +8332,14 @@ impl<S: Simd> SimdFrom<v128, S> for u32x4<S> {
 impl<S: Simd> From<u32x4<S>> for v128 {
     #[inline(always)]
     fn from(value: u32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for mask32x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8347,14 +8347,14 @@ impl<S: Simd> SimdFrom<v128, S> for mask32x4<S> {
 impl<S: Simd> From<mask32x4<S>> for v128 {
     #[inline(always)]
     fn from(value: mask32x4<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for f64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8362,14 +8362,14 @@ impl<S: Simd> SimdFrom<v128, S> for f64x2<S> {
 impl<S: Simd> From<f64x2<S>> for v128 {
     #[inline(always)]
     fn from(value: f64x2<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 impl<S: Simd> SimdFrom<v128, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
         Self {
-            val: unsafe { core::mem::transmute_copy(&arch) },
+            val: unsafe { crate::support::checked_transmute_copy(&arch) },
             simd,
         }
     }
@@ -8377,7 +8377,7 @@ impl<S: Simd> SimdFrom<v128, S> for mask64x2<S> {
 impl<S: Simd> From<mask64x2<S>> for v128 {
     #[inline(always)]
     fn from(value: mask64x2<S>) -> Self {
-        unsafe { core::mem::transmute_copy(&value.val) }
+        unsafe { crate::support::checked_transmute_copy(&value.val) }
     }
 }
 #[doc = r" This is a vector extend, like `vext` on ARM or `alignr` on x86, that takes a non-const shift argument."]
diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs
index 2c298326f..fce929808 100644
--- a/fearless_simd/src/support.rs
+++ b/fearless_simd/src/support.rs
@@ -1,6 +1,8 @@
 // Copyright 2025 the Fearless_SIMD Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
+use core::mem::size_of;
+
 #[derive(Clone, Copy, Debug)]
 #[repr(C, align(16))]
 #[expect(
@@ -28,6 +30,29 @@ pub struct Aligned256<T>(pub T);
 /// Wrapper for internal native vector types that gives them 512-bit alignment.
 pub struct Aligned512<T>(pub T);
 
+/// Like [`core::mem::transmute_copy`], but fails to compile when `Src` and `Dst` differ in size.
+///
+/// # Safety
+///
+/// The bytes of `*src` must be a valid value of type `Dst`. Only the size equality is checked
+/// here; the caller must uphold the rest of `transmute_copy`'s safety contract.
+#[inline(always)]
+#[allow(
+    clippy::disallowed_methods,
+    reason = "This is the central checked wrapper around transmute_copy"
+)]
+pub(crate) unsafe fn checked_transmute_copy<Src, Dst>(src: &Src) -> Dst {
+    const {
+        assert!(
+            size_of::<Src>() == size_of::<Dst>(),
+            "checked_transmute_copy requires source and destination to have the same size"
+        );
+    }
+    // Safety: The caller guarantees the bytes of `*src` are a valid `Dst`, and the const
+    // assertion above guarantees `transmute_copy` never reads past the end of `src`.
+    unsafe { core::mem::transmute_copy(src) }
+}
+
 /// The actual `Debug` implementation for all `SimdBase` types. This only needs to be monomorphized once per element
 /// type, rather than once per vector type.
 #[inline(never)]
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index e2d2cfeef..233ad6ffa 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -371,25 +371,25 @@ pub(crate) fn generic_from_array(
     } else {
         quote! { val }
     };
-
     // There are architecture-specific "load" intrinsics, but they can actually be *worse* for performance. If they
     // lower to LLVM intrinsics, they will likely not be optimized until much later in the pipeline (if at all),
     // resulting in substantially worse codegen. See https://github.com/linebender/fearless_simd/pull/185.
-    let expr = quote! {
+    let expr = quote! {{
         // Safety: The native vector type backing any implementation will be:
         // - A `#[repr(simd)]` type, which has the same layout as an array of scalars
         // - An array of `#[repr(simd)]` types
         // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types
         //
-        // These all have the same layout as a flat array of the corresponding scalars. The native vector types probably
-        // have greater alignment requirements than the source array type we're copying from, but that's explicitly
-        // allowed by transmute_copy:
+        // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy`
+        // statically verifies that the source and destination sizes match. The native vector types probably have
+        // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by
+        // transmute_copy:
         //
         // > This function will unsafely assume the pointer src is valid for size_of::<Dst> bytes by transmuting &Src to
         // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has
         // > stricter alignment requirements than &Src).**
-        unsafe { core::mem::transmute_copy(#inner_ref) }
-    };
+        unsafe { crate::support::checked_transmute_copy(#inner_ref) }
+    }};
     let vec_rust = vec_ty.rust();
 
     quote! {
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index 8022eb0e4..0a5d2735a 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -237,7 +237,7 @@ pub(crate) trait Level {
                     #[inline(always)]
                     fn simd_from(simd: S, arch: #arch) -> Self {
                         Self {
-                            val: unsafe { core::mem::transmute_copy(&arch) },
+                            val: unsafe { crate::support::checked_transmute_copy(&arch) },
                             simd
                         }
                     }
@@ -245,7 +245,7 @@ pub(crate) trait Level {
                 impl<S: Simd> From<#simd<S>> for #arch {
                     #[inline(always)]
                     fn from(value: #simd<S>) -> Self {
-                        unsafe { core::mem::transmute_copy(&value.val) }
+                        unsafe { crate::support::checked_transmute_copy(&value.val) }
                     }
                 }
                 }
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 420e8fcb7..3c35b249e 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -186,7 +186,8 @@ impl Level for X86 {
             impl<S: Simd> SimdFrom<#arch, S> for #simd<S> {
                 #[inline(always)]
                 fn simd_from(simd: S, arch: #arch) -> Self {
-                    let lanes: [#lane_ty; #len] = unsafe { core::mem::transmute_copy(&arch) };
+                    let lanes: [#lane_ty; #len] =
+                        unsafe { crate::support::checked_transmute_copy(&arch) };
                     lanes.simd_into(simd)
                 }
             }
@@ -194,7 +195,7 @@ impl Level for X86 {
                 #[inline(always)]
                 fn from(value: #simd<S>) -> Self {
                     let lanes: [#lane_ty; #len] = value.into();
-                    unsafe { core::mem::transmute_copy(&lanes) }
+                    unsafe { crate::support::checked_transmute_copy(&lanes) }
                 }
             }
         })
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index f2c39ada3..09b597aab 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -161,10 +161,10 @@ fn x86_mask_arch_conversions_roundtrip<S: Simd>(simd: S) {
 
                 let mask = $mask::from_bitmask(simd, bits);
                 let arch: $arch = mask.into();
-                let lanes: [$lane; $lanes] = unsafe { core::mem::transmute_copy(&arch) };
+                let lanes: [$lane; $lanes] = unsafe { core::mem::transmute(arch) };
                 assert_eq!(lanes, expected);
 
-                let arch: $arch = unsafe { core::mem::transmute_copy(&expected) };
+                let arch: $arch = unsafe { core::mem::transmute(expected) };
                 let mask = $mask::simd_from(simd, arch);
                 assert_eq!(mask.to_bitmask(), bits);
             }};

From aef1cac2692d1e81ec01f6dd9321949da8f05106 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:34:57 +0100
Subject: [PATCH 03/55] Expand native type conversion test coverage

---
 .../tests/harness/lm_generated.rs             |   2 +
 .../lm_generated/mask_roundtrip_x86.rs        | 240 ++++++++++++++++++
 fearless_simd_tests/tests/mod.rs              |  38 ---
 3 files changed, 242 insertions(+), 38 deletions(-)
 create mode 100644 fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs

diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs
index 789a8eb99..3e30f814e 100644
--- a/fearless_simd_tests/tests/harness/lm_generated.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated.rs
@@ -3,5 +3,7 @@
 
 mod extended_512;
 mod mask_methods;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod mask_roundtrip_x86;
 mod mod_256;
 mod mod_512;
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
new file mode 100644
index 000000000..385a516cd
--- /dev/null
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -0,0 +1,240 @@
+// Copyright 2026 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use fearless_simd::*;
+use fearless_simd_dev_macros::simd_test;
+
+const INTERESTING_32: &[u64] = &[
+    0x0000_0000,
+    0x0000_0001,
+    0x8000_0000,
+    0x0000_ffff,
+    0xffff_0000,
+    0x5555_5555,
+    0xaaaa_aaaa,
+    0x8000_aa55,
+    0xffff_ffff,
+    0xffff_ffff_0000_0000,
+    0xffff_ffff_8000_aa55,
+    0xffff_ffff_ffff_ffff,
+];
+
+const INTERESTING_64: &[u64] = &[
+    0x0000_0000_0000_0000,
+    0x0000_0000_0000_0001,
+    0x8000_0000_0000_0000,
+    0x0000_0000_ffff_ffff,
+    0xffff_ffff_0000_0000,
+    0x5555_5555_5555_5555,
+    0xaaaa_aaaa_aaaa_aaaa,
+    0x8000_0001_5555_aaab,
+    0xffff_ffff_ffff_ffff,
+];
+
+fn lane_mask(lanes: usize) -> u64 {
+    if lanes == u64::BITS as usize {
+        u64::MAX
+    } else {
+        (1_u64 << lanes) - 1
+    }
+}
+
+trait MaskArch: Copy + Eq + core::fmt::Debug {
+    fn from_bits(bits: u64) -> Self;
+}
+
+impl MaskArch for u8 {
+    fn from_bits(bits: u64) -> Self {
+        Self::try_from(bits).expect("masked bits fit in __mmask8")
+    }
+}
+
+impl MaskArch for u16 {
+    fn from_bits(bits: u64) -> Self {
+        Self::try_from(bits).expect("masked bits fit in __mmask16")
+    }
+}
+
+impl MaskArch for u32 {
+    fn from_bits(bits: u64) -> Self {
+        Self::try_from(bits).expect("masked bits fit in __mmask32")
+    }
+}
+
+impl MaskArch for u64 {
+    fn from_bits(bits: u64) -> Self {
+        bits
+    }
+}
+
+macro_rules! assert_native_vector_roundtrip {
+    ($simd:expr, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{
+        let bits = $bits;
+        let expected_bits = bits & lane_mask($lanes);
+        let expected_lanes: [$lane; $lanes] = core::array::from_fn(|i| {
+            if ((expected_bits >> i) & 1) != 0 {
+                -1
+            } else {
+                0
+            }
+        });
+
+        let mask = $mask::from_bitmask($simd, bits);
+        let arch: $arch = mask.into();
+        // Safety: `transmute` only compiles when `$arch` and `[$lane; $lanes]` have the same
+        // size, and every bit pattern is a valid value of a signed integer array.
+        let lanes = unsafe { core::mem::transmute::<$arch, [$lane; $lanes]>(arch) };
+        assert_eq!(
+            lanes,
+            expected_lanes,
+            "{} -> {} lane values for {bits:#018x}",
+            stringify!($mask),
+            stringify!($arch)
+        );
+
+        // Safety: sizes match as above and `$arch` accepts any bit pattern; the lanes hold the
+        // 0 / -1 encoding that the public mask conversion is expected to consume.
+        let arch = unsafe { core::mem::transmute::<[$lane; $lanes], $arch>(expected_lanes) };
+        let mask = $mask::simd_from($simd, arch);
+        assert_eq!(
+            mask.to_bitmask(),
+            expected_bits,
+            "{} <- {} bitmask for {bits:#018x}",
+            stringify!($mask),
+            stringify!($arch)
+        );
+    }};
+}
+
+macro_rules! assert_native_mask_roundtrip {
+    ($simd:expr, $mask:ident, $arch:ty, $lanes:literal, $bits:expr) => {{
+        let bits = $bits;
+        let expected_bits = bits & lane_mask($lanes);
+        let expected_arch = <$arch as MaskArch>::from_bits(expected_bits);
+
+        let mask = $mask::from_bitmask($simd, bits);
+        let arch: $arch = mask.into();
+        assert_eq!(
+            arch,
+            expected_arch,
+            "{} -> {} for {bits:#018x}",
+            stringify!($mask),
+            stringify!($arch)
+        );
+
+        let mask = $mask::simd_from($simd, expected_arch);
+        assert_eq!(
+            mask.to_bitmask(),
+            expected_bits,
+            "{} <- {} bitmask for {bits:#018x}",
+            stringify!($mask),
+            stringify!($arch)
+        );
+
+        let arch: $arch = mask.into();
+        assert_eq!(
+            arch,
+            expected_arch,
+            "{} -> {} after roundtrip for {bits:#018x}",
+            stringify!($mask),
+            stringify!($arch)
+        );
+    }};
+}
+
+macro_rules! native_vector_roundtrip_exhaustive {
+    ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal) => {
+        #[simd_test]
+        fn $test<S: Simd>(simd: S) {
+            for bits in 0..=0xffff_u64 {
+                assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits);
+            }
+        }
+    };
+}
+
+macro_rules! native_vector_roundtrip_interesting {
+    ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $values:ident) => {
+        #[simd_test]
+        fn $test<S: Simd>(simd: S) {
+            for &bits in $values {
+                assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits);
+            }
+        }
+    };
+}
+
+macro_rules! native_mask_roundtrip_exhaustive {
+    ($test:ident, $mask:ident, $arch:ty, $lanes:literal) => {
+        #[simd_test]
+        fn $test<S: Simd>(simd: S) {
+            for bits in 0..=0xffff_u64 {
+                assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits);
+            }
+        }
+    };
+}
+
+macro_rules! native_mask_roundtrip_interesting {
+    ($test:ident, $mask:ident, $arch:ty, $lanes:literal, $values:ident) => {
+        #[simd_test]
+        fn $test<S: Simd>(simd: S) {
+            for &bits in $values {
+                assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits);
+            }
+        }
+    };
+}
+
+native_vector_roundtrip_exhaustive!(mask8x16_m128i_roundtrip, mask8x16, __m128i, i8, 16);
+native_vector_roundtrip_exhaustive!(mask16x8_m128i_roundtrip, mask16x8, __m128i, i16, 8);
+native_vector_roundtrip_exhaustive!(mask32x4_m128i_roundtrip, mask32x4, __m128i, i32, 4);
+native_vector_roundtrip_exhaustive!(mask64x2_m128i_roundtrip, mask64x2, __m128i, i64, 2);
+
+native_vector_roundtrip_interesting!(
+    mask8x32_m256i_roundtrip,
+    mask8x32,
+    __m256i,
+    i8,
+    32,
+    INTERESTING_32
+);
+native_vector_roundtrip_exhaustive!(mask16x16_m256i_roundtrip, mask16x16, __m256i, i16, 16);
+native_vector_roundtrip_exhaustive!(mask32x8_m256i_roundtrip, mask32x8, __m256i, i32, 8);
+native_vector_roundtrip_exhaustive!(mask64x4_m256i_roundtrip, mask64x4, __m256i, i64, 4);
+
+native_mask_roundtrip_exhaustive!(mask8x16_mmask16_roundtrip, mask8x16, __mmask16, 16);
+native_mask_roundtrip_exhaustive!(mask16x8_mmask8_roundtrip, mask16x8, __mmask8, 8);
+native_mask_roundtrip_exhaustive!(mask32x4_mmask8_roundtrip, mask32x4, __mmask8, 4);
+native_mask_roundtrip_exhaustive!(mask64x2_mmask8_roundtrip, mask64x2, __mmask8, 2);
+native_mask_roundtrip_interesting!(
+    mask8x32_mmask32_roundtrip,
+    mask8x32,
+    __mmask32,
+    32,
+    INTERESTING_32
+);
+native_mask_roundtrip_exhaustive!(mask16x16_mmask16_roundtrip, mask16x16, __mmask16, 16);
+native_mask_roundtrip_exhaustive!(mask32x8_mmask8_roundtrip, mask32x8, __mmask8, 8);
+native_mask_roundtrip_exhaustive!(mask64x4_mmask8_roundtrip, mask64x4, __mmask8, 4);
+native_mask_roundtrip_interesting!(
+    mask8x64_mmask64_roundtrip,
+    mask8x64,
+    __mmask64,
+    64,
+    INTERESTING_64
+);
+native_mask_roundtrip_interesting!(
+    mask16x32_mmask32_roundtrip,
+    mask16x32,
+    __mmask32,
+    32,
+    INTERESTING_32
+);
+native_mask_roundtrip_exhaustive!(mask32x16_mmask16_roundtrip, mask32x16, __mmask16, 16);
+native_mask_roundtrip_exhaustive!(mask64x8_mmask8_roundtrip, mask64x8, __mmask8, 8);
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index 09b597aab..6559ea92d 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -144,44 +144,6 @@ fn avx512_masks_are_compact() {
     assert_eq!(size_of::<mask64x8<A>>(), size_of::<__mmask8>());
 }
 
-#[simd_test]
-fn x86_mask_arch_conversions_roundtrip<S: Simd>(simd: S) {
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    {
-        #[cfg(target_arch = "x86")]
-        use core::arch::x86::*;
-        #[cfg(target_arch = "x86_64")]
-        use core::arch::x86_64::*;
-
-        macro_rules! assert_roundtrip {
-            ($mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{
-                let bits: u64 = $bits;
-                let expected: [$lane; $lanes] =
-                    core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { -1 } else { 0 });
-
-                let mask = $mask::from_bitmask(simd, bits);
-                let arch: $arch = mask.into();
-                let lanes: [$lane; $lanes] = unsafe { core::mem::transmute(arch) };
-                assert_eq!(lanes, expected);
-
-                let arch: $arch = unsafe { core::mem::transmute(expected) };
-                let mask = $mask::simd_from(simd, arch);
-                assert_eq!(mask.to_bitmask(), bits);
-            }};
-        }
-
-        assert_roundtrip!(mask8x16, __m128i, i8, 16, 0xa55a);
-        assert_roundtrip!(mask16x8, __m128i, i16, 8, 0xa5);
-        assert_roundtrip!(mask32x4, __m128i, i32, 4, 0xb);
-        assert_roundtrip!(mask64x2, __m128i, i64, 2, 0x2);
-
-        assert_roundtrip!(mask8x32, __m256i, i8, 32, 0xa55a_5aa5);
-        assert_roundtrip!(mask16x16, __m256i, i16, 16, 0x5aa5);
-        assert_roundtrip!(mask32x8, __m256i, i32, 8, 0xa5);
-        assert_roundtrip!(mask64x4, __m256i, i64, 4, 0xb);
-    }
-}
-
 #[simd_test]
 #[ignore]
 fn test_f32_to_i32_precise_exhaustive<S: Simd>(simd: S) {

From c12a7cc76287714f505b1d259db2e4769da99606 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:35:46 +0100
Subject: [PATCH 04/55] Rename test: mask_methods.rs -> mask_roundtrip.rs

---
 .../harness/lm_generated/{mask_methods.rs => mask_roundtrip.rs}   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename fearless_simd_tests/tests/harness/lm_generated/{mask_methods.rs => mask_roundtrip.rs} (100%)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
similarity index 100%
rename from fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
rename to fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs

From 9d9adf8b674b46e44bd05cb8704adbb2803598fb Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:38:34 +0100
Subject: [PATCH 05/55] Check in the new generated AVX-512 file

---
 fearless_simd/src/generated/avx512.rs | 9604 +++++++++++++++++++++++++
 1 file changed, 9604 insertions(+)
 create mode 100644 fearless_simd/src/generated/avx512.rs

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
new file mode 100644
index 000000000..3b8bd1af9
--- /dev/null
+++ b/fearless_simd/src/generated/avx512.rs
@@ -0,0 +1,9604 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// This file is autogenerated by fearless_simd_gen
+
+use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
+use crate::{
+    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
+    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
+    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
+    u32x4, u32x8, u32x16,
+};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+#[doc = "A token for AVX-512 intrinsics on `x86` and `x86_64`, representing an Ice Lake feature level."]
+#[derive(Clone, Copy, Debug)]
+pub struct Avx512 {
+    _private: (), // private field: code outside this module cannot build the token with a struct literal
+}
+impl Avx512 {
+    #[doc = r" Create a SIMD token."]
+    #[doc = r""]
+    #[doc = r" # Safety"]
+    #[doc = r""]
+    #[doc = r" The Ice Lake AVX-512 CPU feature set must be available."]
+    #[inline]
+    pub const unsafe fn new_unchecked() -> Self { // `const`, so also usable in constant contexts
+        Self { _private: () } // performs no feature detection; the caller vouches for the CPU
+    }
+}
+impl Seal for Avx512 {} // `Seal` needs no items here; the empty impl is sufficient
+impl ArchTypes for Avx512 { // native storage type behind every portable vector and mask type
+    type f32x4 = crate::support::Aligned128<__m128>; // vectors: the matching `__m128*` type inside `Aligned128`
+    type i8x16 = crate::support::Aligned128<__m128i>;
+    type u8x16 = crate::support::Aligned128<__m128i>;
+    type mask8x16 = __mmask16; // masks: a bare `__mmask*` value with no wrapper
+    type i16x8 = crate::support::Aligned128<__m128i>;
+    type u16x8 = crate::support::Aligned128<__m128i>;
+    type mask16x8 = __mmask8;
+    type i32x4 = crate::support::Aligned128<__m128i>;
+    type u32x4 = crate::support::Aligned128<__m128i>;
+    type mask32x4 = __mmask8; // same `__mmask8` type as `mask16x8`, although only 4 lanes exist
+    type f64x2 = crate::support::Aligned128<__m128d>;
+    type mask64x2 = __mmask8; // 2 lanes, still `__mmask8`
+    type f32x8 = crate::support::Aligned256<__m256>; // 256-bit vectors use `Aligned256`
+    type i8x32 = crate::support::Aligned256<__m256i>;
+    type u8x32 = crate::support::Aligned256<__m256i>;
+    type mask8x32 = __mmask32;
+    type i16x16 = crate::support::Aligned256<__m256i>;
+    type u16x16 = crate::support::Aligned256<__m256i>;
+    type mask16x16 = __mmask16;
+    type i32x8 = crate::support::Aligned256<__m256i>;
+    type u32x8 = crate::support::Aligned256<__m256i>;
+    type mask32x8 = __mmask8;
+    type f64x4 = crate::support::Aligned256<__m256d>;
+    type mask64x4 = __mmask8;
+    type f32x16 = crate::support::Aligned512<__m512>; // 512-bit vectors use `Aligned512`
+    type i8x64 = crate::support::Aligned512<__m512i>;
+    type u8x64 = crate::support::Aligned512<__m512i>;
+    type mask8x64 = __mmask64;
+    type i16x32 = crate::support::Aligned512<__m512i>;
+    type u16x32 = crate::support::Aligned512<__m512i>;
+    type mask16x32 = __mmask32;
+    type i32x16 = crate::support::Aligned512<__m512i>;
+    type u32x16 = crate::support::Aligned512<__m512i>;
+    type mask32x16 = __mmask16;
+    type f64x8 = crate::support::Aligned512<__m512d>;
+    type mask64x8 = __mmask8;
+}
+impl Simd for Avx512 {
+    type f32s = f32x16<Self>; // every `*s` vector alias below is 512 bits wide: 16 lanes x 32 bits here
+    type f64s = f64x8<Self>;
+    type u8s = u8x64<Self>;
+    type i8s = i8x64<Self>;
+    type u16s = u16x32<Self>;
+    type i16s = i16x32<Self>;
+    type u32s = u32x16<Self>;
+    type i32s = i32x16<Self>;
+    type mask8s = mask8x64<Self>; // mask aliases have the same lane counts as the vectors above
+    type mask16s = mask16x32<Self>;
+    type mask32s = mask32x16<Self>;
+    type mask64s = mask64x8<Self>;
+    #[inline(always)]
+    fn level(self) -> Level {
+        Level::Avx512(self) // wrap this token in the `Level::Avx512` variant
+    }
+    #[inline]
+    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
+        #[target_feature(
+            enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves"
+        )]
+        unsafe fn vectorize_avx512<F: FnOnce() -> R, R>(f: F) -> R { // runs `f` inside a function compiled with the features above
+            f()
+        }
+        unsafe { vectorize_avx512(f) } // NOTE(review): soundness presumably rests on `self` proving the features exist (see `new_unchecked`)
+    }
+    #[inline(always)]
+    fn splat_f32x4(self, val: f32) -> f32x4<Self> {
+        unsafe { _mm_set1_ps(val).simd_into(self) } // broadcast `val` to all four lanes
+    }
+    #[inline(always)]
+    fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
+        f32x4 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // array bytes -> vector storage; presumably size-checked, see helper
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
+        f32x4 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // same helper as the by-value load, fed the borrowed array
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
+        unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) } // by-value reinterpretation of the register as an array
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
+        unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) } // same borrow, viewed as an array
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
+        unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) } // same mutable borrow, viewed as an array
+    }
+    #[inline(always)]
+    fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping( // `a` is a local value, so it cannot overlap `dest`
+                (&raw const a.val.0) as *const f32,
+                dest.as_mut_ptr(),
+                4usize, // element count (f32 lanes), not bytes
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
+        unsafe {
+            f32x4 {
+                val: core::mem::transmute(a.val), // bit-for-bit reinterpretation, no value conversion
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
+        unsafe {
+            u8x16 {
+                val: core::mem::transmute(a.val), // bit-for-bit reinterpretation, no value conversion
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            if SHIFT >= 4usize { // a shift of the full lane count (or more) returns `b` unchanged
+                return b;
+            }
+            let result = dyn_alignr_128( // presumably a byte-wise align-right over `b` and `a` — confirm in the helper
+                self.cvt_to_bytes_f32x4(b).val.0,
+                self.cvt_to_bytes_f32x4(a).val.0,
+                SHIFT * 4usize, // lanes -> bytes (4 bytes per f32)
+            );
+            self.cvt_from_bytes_f32x4(u8x16 { // re-wrap the raw register and cast back to f32 lanes
+                val: crate::support::Aligned128(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f32x4<const SHIFT: usize>(
+        self,
+        a: f32x4<Self>,
+        b: f32x4<Self>,
+    ) -> f32x4<Self> {
+        self.slide_f32x4::<SHIFT>(a, b) // delegates to the plain slide; presumably a 128-bit vector is a single block
+    }
+    #[inline(always)]
+    fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } // `-0.0` is the sign bit alone; and-not clears it
+    }
+    #[inline(always)]
+    fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } // XOR with the sign bit flips the sign
+    }
+    #[inline(always)]
+    fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } // lane-wise square root
+    }
+    #[inline(always)]
+    fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_rcp_ps(a.into()).simd_into(self) } // `rcpps` is an approximation, not an exact `1.0 / a`
+    }
+    #[inline(always)]
+    fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } // lane-wise `a + b`
+    }
+    #[inline(always)]
+    fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } // lane-wise `a - b`
+    }
+    #[inline(always)]
+    fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } // lane-wise `a * b`
+    }
+    #[inline(always)]
+    fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } // lane-wise `a / b`
+    }
+    #[inline(always)]
+    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            let mask = _mm_set1_ps(-0.0); // only the sign bit is set
+            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) // sign bit from `b`, all other bits from `a`
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()), // predicate 0 = `_CMP_EQ_OQ` (ordered, quiet): false if either lane is NaN
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()), // predicate 17 = `_CMP_LT_OQ` (ordered, quiet)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()), // predicate 18 = `_CMP_LE_OQ` (ordered, quiet)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()), // predicate 29 = `_CMP_GE_OQ` (ordered, quiet)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
+        unsafe {
+            mask32x4 {
+                val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()), // predicate 30 = `_CMP_GT_OQ` (ordered, quiet)
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } // [a0, b0, a1, b1]
+    }
+    #[inline(always)]
+    fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } // [a2, b2, a3, b3]
+    }
+    #[inline(always)]
+    fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } // even lanes: [a0, a2, b0, b2]
+    }
+    #[inline(always)]
+    fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } // odd lanes: [a1, a3, b1, b3]
+    }
+    #[inline(always)]
+    fn interleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        (self.zip_low_f32x4(a, b), self.zip_high_f32x4(a, b)) // (low zip, high zip) as a pair
+    }
+    #[inline(always)]
+    fn deinterleave_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        (self.unzip_low_f32x4(a, b), self.unzip_high_f32x4(a, b)) // (even lanes, odd lanes) as a pair
+    }
+    #[inline(always)]
+    fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } // plain `maxps`; no extra NaN handling is added here
+    }
+    #[inline(always)]
+    fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } // plain `minps`; no extra NaN handling is added here
+    }
+    #[inline(always)]
+    fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            let intermediate = _mm_max_ps(a.into(), b.into()); // `maxps` yields its second operand when either input is NaN
+            let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into()); // predicate 3 = `_CMP_UNORD_Q`: set where `b` is NaN
+            _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) // take `a` where `b` is NaN, else the `maxps` result
+        }
+    }
+    #[inline(always)]
+    fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            let intermediate = _mm_min_ps(a.into(), b.into()); // `minps` yields its second operand when either input is NaN
+            let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into()); // predicate 3 = `_CMP_UNORD_Q`: set where `b` is NaN
+            _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self) // take `a` where `b` is NaN, else the `minps` result
+        }
+    }
+    #[inline(always)]
+    fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) } // fused `a * b + c`
+    }
+    #[inline(always)]
+    fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) } // fused `a * b - c`
+    }
+    #[inline(always)]
+    fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { // round toward negative infinity; `_MM_FROUND_NO_EXC` suppresses floating-point exceptions
+            _mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { // round toward positive infinity; `_MM_FROUND_NO_EXC` suppresses floating-point exceptions
+            _mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { // `_MM_FROUND_TO_NEAREST_INT` rounds to nearest with ties going to even
+            _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        a - self.trunc_f32x4(a) // what remains after truncation toward zero, e.g. -1.5 -> -0.5
+    }
+    #[inline(always)]
+    fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
+        unsafe { // round toward zero; `_MM_FROUND_NO_EXC` suppresses floating-point exceptions
+            _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
+        unsafe { _mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } // blend takes its last operand where the bit is set: `b` if set, `c` if clear
+    }
+    #[inline(always)]
+    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
+        unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) } // `a` becomes the low half, `b` the high half
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
+        unsafe { _mm_castps_pd(a.into()).simd_into(self) } // bit cast only, no numeric conversion
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
+        unsafe { _mm_castps_si128(a.into()).simd_into(self) } // bit cast only, no numeric conversion
+    }
+    #[inline(always)]
+    fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
+        unsafe { _mm_castps_si128(a.into()).simd_into(self) } // bit cast only; the return type selects u8 lanes
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
+        unsafe { _mm_castps_si128(a.into()).simd_into(self) } // bit cast only; the return type selects u32 lanes
+    }
+    #[inline(always)]
+    fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
+        unsafe {
+            let mut converted = _mm_cvttps_epi32(a.into()); // signed truncating conversion
+            let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0)); // lanes below 2^31
+            let all_in_range = _mm_movemask_ps(in_range) == 0b1111; // fast path: all four lanes need no fix-up
+            if !all_in_range {
+                let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0)); // amount above 2^31
+                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); // in-range lanes are zeroed before converting
+                converted = _mm_add_epi32(converted, excess_converted); // lanes >= 2^31 hold 0x8000_0000 from the overflowed conversion; add the rest
+            }
+            converted.simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
+        unsafe {
+            let a = _mm_max_ps(a.into(), _mm_setzero_ps()); // clamp negative lanes to 0
+            let mut converted = _mm_cvttps_epi32(a); // signed truncating conversion
+            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); // lanes below 2^31
+            let all_in_range = _mm_movemask_ps(in_range) == 0b1111; // fast path: all four lanes need no fix-up
+            if !all_in_range {
+                let exceeds_unsigned_range =
+                    _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a)); // 4294967040 = 2^32 - 256; lanes above it saturate
+                let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0)); // amount above 2^31
+                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess)); // in-range lanes are zeroed before converting
+                converted = _mm_add_epi32(converted, excess_converted); // complete the lanes in [2^31, 2^32)
+                converted = _mm_blendv_epi8(
+                    converted,
+                    _mm_set1_epi32(u32::MAX.cast_signed()), // saturation value for lanes beyond the u32 range
+                    exceeds_unsigned_range,
+                );
+            }
+            converted.simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
+        unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) } // truncating conversion; out-of-range lanes get whatever `cvttps2dq` yields
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
+        unsafe {
+            let a = a.into();
+            let mut converted = _mm_cvttps_epi32(a); // truncating conversion
+            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0)); // lanes below 2^31; false for NaN
+            let all_in_range = _mm_movemask_ps(in_range) == 0b1111; // fast path: nothing to saturate
+            if !all_in_range {
+                converted = _mm_blendv_epi8( // keep `converted` where in range, `i32::MAX` elsewhere
+                    _mm_set1_epi32(i32::MAX),
+                    converted,
+                    _mm_castps_si128(in_range),
+                );
+                let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a)); // all ones where the lane is not NaN
+                converted = _mm_and_si128(converted, is_not_nan); // NaN lanes become 0
+            }
+            converted.simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn splat_i8x16(self, val: i8) -> i8x16<Self> {
+        unsafe { _mm_set1_epi8(val).simd_into(self) } // broadcast `val` to all 16 lanes
+    }
+    #[inline(always)]
+    fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
+        i8x16 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // array bytes -> vector storage; presumably size-checked, see helper
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
+        i8x16 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // same helper as the by-value load, fed the borrowed array
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
+        unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } // by-value reinterpretation of the register as an array
+    }
+    #[inline(always)]
+    fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
+        unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) } // same borrow, viewed as an array
+    }
+    #[inline(always)]
+    fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
+        unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) } // same mutable borrow, viewed as an array
+    }
+    #[inline(always)]
+    fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping( // `a` is a local value, so it cannot overlap `dest`
+                (&raw const a.val.0) as *const i8,
+                dest.as_mut_ptr(),
+                16usize, // element count (i8 lanes)
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            i8x16 {
+                val: core::mem::transmute(a.val), // bit-for-bit reinterpretation, no value conversion
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            u8x16 {
+                val: core::mem::transmute(a.val), // bit-for-bit reinterpretation, no value conversion
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            if SHIFT >= 16usize { // a shift of the full lane count (or more) returns `b` unchanged
+                return b;
+            }
+            let result = dyn_alignr_128( // presumably a byte-wise align-right over `b` and `a` — confirm in the helper
+                self.cvt_to_bytes_i8x16(b).val.0,
+                self.cvt_to_bytes_i8x16(a).val.0,
+                SHIFT, // lanes are bytes, so no scaling is needed
+            );
+            self.cvt_from_bytes_i8x16(u8x16 { // re-wrap the raw register and cast back to i8 lanes
+                val: crate::support::Aligned128(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i8x16<const SHIFT: usize>(
+        self,
+        a: i8x16<Self>,
+        b: i8x16<Self>,
+    ) -> i8x16<Self> {
+        self.slide_i8x16::<SHIFT>(a, b) // delegates to the plain slide; presumably a 128-bit vector is a single block
+    }
+    #[inline(always)]
+    fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } // lane-wise wrapping add
+    }
+    #[inline(always)]
+    fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } // lane-wise wrapping subtract
+    }
+    #[inline(always)]
+    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the even bytes
+            let dst_odd =
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); // odd bytes moved down, then multiplied
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // odd products back into the high byte
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // even products: keep the low byte only
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } // bitwise AND of the whole register
+    }
+    #[inline(always)]
+    fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } // bitwise OR of the whole register
+    }
+    #[inline(always)]
+    fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } // bitwise XOR of the whole register
+    }
+    #[inline(always)]
+    fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
+        a ^ !0 // XOR with an all-ones scalar inverts every bit
+    }
+    #[inline(always)]
+    fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
+        // Lane-wise wrapping `a << shift` (all-zero once `shift >= 8`). There is no 8-bit shift,
+        // and widening + `_mm_packs_epi16` saturates, so shift 16-bit lanes and mask instead.
+        unsafe {
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let shifted = _mm_sll_epi16(a.into(), shift_count);
+            // Clears the bits that crossed into a neighbouring byte; `None` means `shift >= 8`.
+            let keep = _mm_set1_epi8(u8::MAX.checked_shl(shift).unwrap_or(0).cast_signed());
+            _mm_and_si128(shifted, keep).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) // scalar fallback: lane `i` of `a` shifted by lane `i` of `b`
+    }
+    #[inline(always)]
+    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); // shift amount in the low bits of a vector register
+            let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); // sign-extend low 8 bytes: the compare gives 0xFF for negative bytes
+            let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val)); // sign-extend high 8 bytes
+            let lo_shifted = _mm_sra_epi16(lo_16, shift_count); // arithmetic (sign-preserving) shift
+            let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
+            _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) // results still fit in i8, so the saturating pack loses nothing
+        }
+    }
+    #[inline(always)]
+    fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) // scalar fallback: lane `i` of `a` shifted by lane `i` of `b`
+    }
+    #[inline(always)]
+    fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        unsafe {
+            mask8x16 {
+                val: _mm_cmpeq_epi8_mask(a.into(), b.into()), // compare result arrives directly as a `__mmask16`
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        unsafe {
+            mask8x16 {
+                val: _mm_cmplt_epi8_mask(a.into(), b.into()), // signed `a < b`
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        unsafe {
+            mask8x16 {
+                val: _mm_cmple_epi8_mask(a.into(), b.into()), // signed `a <= b`
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        unsafe {
+            mask8x16 {
+                val: _mm_cmpge_epi8_mask(a.into(), b.into()), // signed `a >= b`
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
+        unsafe {
+            mask8x16 {
+                val: _mm_cmpgt_epi8_mask(a.into(), b.into()), // signed `a > b`
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } // a0, b0, a1, b1, ... from the low 8 bytes of each input
+    }
+    #[inline(always)]
+    fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } // a8, b8, a9, b9, ... from the high 8 bytes of each input
+    }
+    #[inline(always)]
+    fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); // even bytes to the low half, odd bytes to the high half
+            let t1 = _mm_shuffle_epi8(a.into(), mask);
+            let t2 = _mm_shuffle_epi8(b.into(), mask);
+            _mm_unpacklo_epi64(t1, t2).simd_into(self) // low halves: even bytes of `a`, then even bytes of `b`
+        }
+    }
+    #[inline(always)]
+    fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe {
+            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); // even bytes to the low half, odd bytes to the high half
+            let t1 = _mm_shuffle_epi8(a.into(), mask);
+            let t2 = _mm_shuffle_epi8(b.into(), mask);
+            _mm_unpackhi_epi64(t1, t2).simd_into(self) // high halves: odd bytes of `a`, then odd bytes of `b`
+        }
+    }
+    #[inline(always)]
+    fn interleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        (self.zip_low_i8x16(a, b), self.zip_high_i8x16(a, b)) // (low zip, high zip) as a pair
+    }
+    #[inline(always)]
+    fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b)) // (even lanes, odd lanes) as a pair
+    }
+    #[inline(always)]
+    fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } // blend takes its last operand where the bit is set: `b` if set, `c` if clear
+    }
+    #[inline(always)]
+    fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } // lane-wise signed minimum
+    }
+    #[inline(always)]
+    fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } // lane-wise signed maximum
+    }
+    #[inline(always)]
+    fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
+        unsafe { _mm256_setr_m128i(a.into(), b.into()).simd_into(self) } // `a` becomes the low half, `b` the high half
+    }
+    #[inline(always)]
+    fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
+        unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } // `0 - a`, wrapping
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
+        __m128i::from(a).simd_into(self) // same register, re-tagged as u8 lanes
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
+        __m128i::from(a).simd_into(self) // same register, re-tagged as u32 lanes
+    }
+    #[inline(always)]
+    fn splat_u8x16(self, val: u8) -> u8x16<Self> {
+        unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) } // the intrinsic takes `i8`; the cast keeps the bit pattern
+    }
+    #[inline(always)]
+    fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
+        u8x16 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // array bytes -> vector storage; presumably size-checked, see helper
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
+        u8x16 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // same helper as the by-value load, fed the borrowed array
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
+        unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) } // by-value reinterpretation of the register as an array
+    }
+    #[inline(always)]
+    fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
+        unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) } // same borrow, viewed as an array
+    }
+    #[inline(always)]
+    fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
+        unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) } // same mutable borrow, viewed as an array
+    }
+    #[inline(always)]
+    fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping( // `a` is a local value, so it cannot overlap `dest`
+                (&raw const a.val.0) as *const u8,
+                dest.as_mut_ptr(),
+                16usize, // element count (u8 lanes)
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            u8x16 {
+                val: core::mem::transmute(a.val), // source and target types are the same here, so this is an identity
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            u8x16 {
+                val: core::mem::transmute(a.val), // source and target types are the same here, so this is an identity
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            if SHIFT >= 16usize { // a shift of the full lane count (or more) returns `b` unchanged
+                return b;
+            }
+            let result = dyn_alignr_128( // presumably a byte-wise align-right over `b` and `a` — confirm in the helper
+                self.cvt_to_bytes_u8x16(b).val.0,
+                self.cvt_to_bytes_u8x16(a).val.0,
+                SHIFT, // lanes are bytes, so no scaling is needed
+            );
+            self.cvt_from_bytes_u8x16(u8x16 { // re-wrap the raw register
+                val: crate::support::Aligned128(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u8x16<const SHIFT: usize>(
+        self,
+        a: u8x16<Self>,
+        b: u8x16<Self>,
+    ) -> u8x16<Self> {
+        self.slide_u8x16::<SHIFT>(a, b) // delegates to the plain slide; presumably a 128-bit vector is a single block
+    }
+    #[inline(always)]
+    fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } // lane-wise wrapping add
+    }
+    #[inline(always)]
+    fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } // lane-wise wrapping subtract
+    }
+    #[inline(always)]
+    fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the even bytes
+            let dst_odd =
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into())); // odd bytes moved down, then multiplied
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // odd products back into the high byte
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // even products: keep the low byte only
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } // bitwise AND of the whole register
+    }
+    #[inline(always)]
+    fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } // bitwise OR of the whole register
+    }
+    #[inline(always)]
+    fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } // bitwise XOR of the whole register
+    }
+    #[inline(always)]
+    fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
+        a ^ !0 // XOR with an all-ones scalar inverts every bit
+    }
+    #[inline(always)]
+    fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
+        // Lane-wise wrapping `a << shift` (all-zero once `shift >= 8`). There is no 8-bit shift,
+        // and widening + `_mm_packus_epi16` saturates, so shift 16-bit lanes and mask instead.
+        unsafe {
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let shifted = _mm_sll_epi16(a.into(), shift_count);
+            // Clears the bits that crossed into a neighbouring byte; `None` means `shift >= 8`.
+            let keep = _mm_set1_epi8(u8::MAX.checked_shl(shift).unwrap_or(0).cast_signed());
+            _mm_and_si128(shifted, keep).simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) // scalar fallback: lane `i` of `a` shifted by lane `i` of `b`
+    }
+    #[inline(always)]
+    fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); // shift amount in the low bits of a vector register
+            let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); // zero-extend the low 8 bytes to 16 bits
+            let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); // zero-extend the high 8 bytes to 16 bits
+            let lo_shifted = _mm_srl_epi16(lo_16, shift_count); // logical (zero-filling) shift
+            let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) // results never exceed 255, so the saturating pack loses nothing
+        }
+    }
+    /// Per-lane right shift: lane `i` of `a` is shifted by lane `i` of `b`.
+    ///
+    /// Implemented lane by lane with the scalar `Shr` operator, then converted back to a vector.
+    #[inline(always)]
+    fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let lanes: [u8; 16usize] =
+            core::array::from_fn(|lane| core::ops::Shr::shr(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b`; the bitmask from `_mm_cmpeq_epu8_mask` is stored directly in the mask.
+    #[inline(always)]
+    fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let lanes = unsafe { _mm_cmpeq_epu8_mask(a.into(), b.into()) };
+        mask8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a < b`; the bitmask from `_mm_cmplt_epu8_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let lanes = unsafe { _mm_cmplt_epu8_mask(a.into(), b.into()) };
+        mask8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a <= b`; the bitmask from `_mm_cmple_epu8_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let lanes = unsafe { _mm_cmple_epu8_mask(a.into(), b.into()) };
+        mask8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a >= b`; the bitmask from `_mm_cmpge_epu8_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let lanes = unsafe { _mm_cmpge_epu8_mask(a.into(), b.into()) };
+        mask8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a > b`; the bitmask from `_mm_cmpgt_epu8_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
+        let lanes = unsafe { _mm_cmpgt_epu8_mask(a.into(), b.into()) };
+        mask8x16 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Interleaves the low halves of `a` and `b` byte by byte using `_mm_unpacklo_epi8`.
+    #[inline(always)]
+    fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        let zipped = unsafe { _mm_unpacklo_epi8(first, second) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves of `a` and `b` byte by byte using `_mm_unpackhi_epi8`.
+    #[inline(always)]
+    fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        let zipped = unsafe { _mm_unpackhi_epi8(first, second) };
+        zipped.simd_into(self)
+    }
+    /// Collects the even-indexed bytes of `a` followed by the even-indexed bytes of `b`.
+    ///
+    /// Each input is shuffled so its even bytes sit in the low 64 bits and its odd bytes in
+    /// the high 64 bits; the two low halves are then joined with `_mm_unpacklo_epi64`.
+    #[inline(always)]
+    fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            let evens_then_odds =
+                _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+            let sorted_a = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let sorted_b = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            let joined = _mm_unpacklo_epi64(sorted_a, sorted_b);
+            joined.simd_into(self)
+        }
+    }
+    /// Collects the odd-indexed bytes of `a` followed by the odd-indexed bytes of `b`.
+    ///
+    /// Each input is shuffled so its even bytes sit in the low 64 bits and its odd bytes in
+    /// the high 64 bits; the two high halves are then joined with `_mm_unpackhi_epi64`.
+    #[inline(always)]
+    fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        unsafe {
+            let evens_then_odds =
+                _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+            let sorted_a = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let sorted_b = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            let joined = _mm_unpackhi_epi64(sorted_a, sorted_b);
+            joined.simd_into(self)
+        }
+    }
+    /// Returns `(zip_low, zip_high)` of `a` and `b` as a pair.
+    #[inline(always)]
+    fn interleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        let low = self.zip_low_u8x16(a, b);
+        let high = self.zip_high_u8x16(a, b);
+        (low, high)
+    }
+    /// Returns `(unzip_low, unzip_high)` of `a` and `b` as a pair.
+    #[inline(always)]
+    fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        let low = self.unzip_low_u8x16(a, b);
+        let high = self.unzip_high_u8x16(a, b);
+        (low, high)
+    }
+    /// Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    ///
+    /// `_mm_mask_blend_epi8` picks its *second* vector operand for set bits, hence `c` is
+    /// passed before `b`.
+    #[inline(always)]
+    fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
+        let picked = unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum of `a` and `b` via `_mm_min_epu8`.
+    #[inline(always)]
+    fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let smallest = unsafe { _mm_min_epu8(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum of `a` and `b` via `_mm_max_epu8`.
+    #[inline(always)]
+    fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let largest = unsafe { _mm_max_epu8(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates two 128-bit vectors into one 256-bit vector (`a` low, `b` high) using
+    /// `_mm256_setr_m128i`.
+    #[inline(always)]
+    fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
+        let low: __m128i = a.into();
+        let high: __m128i = b.into();
+        let wide = unsafe { _mm256_setr_m128i(low, high) };
+        wide.simd_into(self)
+    }
+    /// Zero-extends each of the 16 bytes to a 16-bit lane with `_mm256_cvtepu8_epi16`.
+    #[inline(always)]
+    fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
+        let bytes: __m128i = a.into();
+        let widened = unsafe { _mm256_cvtepu8_epi16(bytes) };
+        widened.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as four `u32` lanes; no bits are changed.
+    #[inline(always)]
+    fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a mask with all 16 lanes set to `val` (bitmask `0xFFFF` for `true`, `0` for `false`).
+    #[inline(always)]
+    fn splat_mask8x16(self, val: bool) -> mask8x16<Self> {
+        let bits = if val { 65535u64 } else { 0 };
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Packs an array of lane values into a compact bitmask: bit `i` is set when `val[i] != 0`.
+    #[inline(always)]
+    fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
+        let mut bits = 0u64;
+        for (lane, entry) in val.iter().enumerate() {
+            if *entry != 0 {
+                bits |= 1u64 << lane;
+            }
+        }
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Expands the compact bitmask into an array: lane `i` is all-ones (`!0`) when bit `i`
+    /// is set and `0` otherwise.
+    #[inline(always)]
+    fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
+        let packed = u64::from(a.val);
+        core::array::from_fn(|lane| {
+            let is_set = ((packed >> lane) & 1) != 0;
+            if is_set { !0 } else { 0 }
+        })
+    }
+    /// Creates a mask from the low 16 bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        let lanes = bits & 65535u64;
+        mask8x16 {
+            val: lanes as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as a `u64` bitmask, one bit per lane in the low 16 bits.
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 65535u64
+    }
+    /// Lane-wise logical AND of two masks, computed on their bitmask representation.
+    #[inline(always)]
+    fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs & rhs) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical OR of two masks, computed on their bitmask representation.
+    #[inline(always)]
+    fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs | rhs) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical XOR of two masks, computed on their bitmask representation.
+    #[inline(always)]
+    fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs ^ rhs) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical NOT of a mask; the result is clamped to the 16 valid lane bits.
+    #[inline(always)]
+    fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
+        let inverted = !u64::from(a.val);
+        let bits = inverted & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between two masks: lane from `b` where `a` is set, otherwise from `c`.
+    ///
+    /// Computed as `(a & b) | (!a & c)` on the bitmask representation.
+    #[inline(always)]
+    fn select_mask8x16(
+        self,
+        a: mask8x16<Self>,
+        b: mask8x16<Self>,
+        c: mask8x16<Self>,
+    ) -> mask8x16<Self> {
+        let cond = u64::from(a.val);
+        let when_set = cond & u64::from(b.val);
+        let when_clear = !cond & u64::from(c.val);
+        let bits = (when_set | when_clear) & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a lane is set where `a` and `b` agree (XNOR).
+    #[inline(always)]
+    fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        let bits = !differing & 65535u64;
+        mask8x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` when at least one of the 16 lanes is set.
+    #[inline(always)]
+    fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) != 0
+    }
+    /// Returns `true` when every one of the 16 lanes is set.
+    #[inline(always)]
+    fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) == 65535u64
+    }
+    /// Returns `true` when at least one of the 16 lanes is clear.
+    #[inline(always)]
+    fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) != 65535u64
+    }
+    /// Returns `true` when none of the 16 lanes is set.
+    #[inline(always)]
+    fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
+        (u64::from(a.val) & 65535u64) == 0
+    }
+    /// Concatenates two 16-lane masks into a 32-lane mask: `a` fills bits 0..16, `b` bits 16..32.
+    #[inline(always)]
+    fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 16usize;
+        let joined = (low | high) & 4294967295u64;
+        mask8x32 {
+            val: joined as _,
+            simd: self,
+        }
+    }
+    /// Broadcasts `val` to all eight 16-bit lanes via `_mm_set1_epi16`.
+    #[inline(always)]
+    fn splat_i16x8(self, val: i16) -> i16x8<Self> {
+        let filled = unsafe { _mm_set1_epi16(val) };
+        filled.simd_into(self)
+    }
+    /// Builds a vector from an owned array by copying its bytes with
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(&val) };
+        i16x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds a vector from a borrowed array by copying its bytes with
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(val) };
+        i16x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the eight lanes as a plain array by transmuting the underlying `__m128i`.
+    #[inline(always)]
+    fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
+        let raw: __m128i = a.val.0;
+        unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(raw) }
+    }
+    /// Borrows the vector's storage as a shared reference to an array of eight `i16` lanes.
+    #[inline(always)]
+    fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
+        // Reference-to-reference transmute of the inner `__m128i`; the borrow stays tied to `a`.
+        unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
+    }
+    /// Borrows the vector's storage as a mutable reference to an array of eight `i16` lanes.
+    #[inline(always)]
+    fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
+        // Reference-to-reference transmute of the inner `__m128i`; the borrow stays tied to `a`.
+        unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
+    }
+    /// Writes the eight lanes of `a` into `dest` with a raw element copy.
+    #[inline(always)]
+    fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
+        let src = (&raw const a.val.0) as *const i16;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 8usize);
+        }
+    }
+    /// Reinterprets a byte vector as an `i16x8` by transmuting its storage.
+    #[inline(always)]
+    fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        i16x8 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `i16x8` as a byte vector by transmuting its storage.
+    #[inline(always)]
+    fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a window of eight lanes across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// A `SHIFT` of eight or more returns `b` unchanged. Otherwise both inputs are viewed as
+    /// bytes and handed to `dyn_alignr_128` with a byte offset of `SHIFT * 2`.
+    #[inline(always)]
+    fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_i16x8(b).val.0;
+        let a_bytes = self.cvt_to_bytes_i16x8(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 2usize) };
+        let as_bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_i16x8(as_bytes)
+    }
+    /// Block-wise slide; this simply forwards to `slide_i16x8` with the same `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_i16x8<const SHIFT: usize>(
+        self,
+        a: i16x8<Self>,
+        b: i16x8<Self>,
+    ) -> i16x8<Self> {
+        Self::slide_i16x8::<SHIFT>(self, a, b)
+    }
+    /// Lane-wise wrapping addition of 16-bit lanes via `_mm_add_epi16`.
+    #[inline(always)]
+    fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let sum = unsafe { _mm_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise wrapping subtraction (`a - b`) of 16-bit lanes via `_mm_sub_epi16`.
+    #[inline(always)]
+    fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let difference = unsafe { _mm_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise multiplication keeping the low 16 bits of each product (`_mm_mullo_epi16`).
+    #[inline(always)]
+    fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let product = unsafe { _mm_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of `a` and `b`, performed on the full 128-bit register via `_mm_and_si128`.
+    #[inline(always)]
+    fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let out = unsafe { _mm_and_si128(lhs, rhs) };
+        out.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b`, performed on the full 128-bit register via `_mm_or_si128`.
+    #[inline(always)]
+    fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let out = unsafe { _mm_or_si128(lhs, rhs) };
+        out.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b`, performed on the full 128-bit register via `_mm_xor_si128`.
+    #[inline(always)]
+    fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let out = unsafe { _mm_xor_si128(lhs, rhs) };
+        out.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with the all-ones scalar `!0`.
+    #[inline(always)]
+    fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
+        // Goes through the vector type's `^` operator overload rather than a direct intrinsic.
+        a ^ !0
+    }
+    /// Shifts every 16-bit lane left by the same `shift` count using `_mm_sll_epi16`.
+    #[inline(always)]
+    fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Per-lane left shift: lane `i` of `a` is shifted by lane `i` of `b`.
+    ///
+    /// Implemented lane by lane with the scalar `Shl` operator, then converted back to a vector.
+    #[inline(always)]
+    fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lanes: [i16; 8usize] =
+            core::array::from_fn(|lane| core::ops::Shl::shl(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Arithmetic (sign-preserving) right shift of every 16-bit lane by `shift`, using
+    /// `_mm_sra_epi16`.
+    #[inline(always)]
+    fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
+        let lanes: __m128i = a.into();
+        unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sra_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Per-lane right shift: lane `i` of `a` is shifted by lane `i` of `b`.
+    ///
+    /// Implemented lane by lane with the scalar `Shr` operator, then converted back to a vector.
+    #[inline(always)]
+    fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lanes: [i16; 8usize] =
+            core::array::from_fn(|lane| core::ops::Shr::shr(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b`; the bitmask from `_mm_cmpeq_epi16_mask` is stored directly in the mask.
+    #[inline(always)]
+    fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmpeq_epi16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a < b`; the bitmask from `_mm_cmplt_epi16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmplt_epi16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a <= b`; the bitmask from `_mm_cmple_epi16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmple_epi16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a >= b`; the bitmask from `_mm_cmpge_epi16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmpge_epi16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a > b`; the bitmask from `_mm_cmpgt_epi16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmpgt_epi16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Interleaves the low halves of `a` and `b` lane by lane using `_mm_unpacklo_epi16`.
+    #[inline(always)]
+    fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        let zipped = unsafe { _mm_unpacklo_epi16(first, second) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves of `a` and `b` lane by lane using `_mm_unpackhi_epi16`.
+    #[inline(always)]
+    fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        let zipped = unsafe { _mm_unpackhi_epi16(first, second) };
+        zipped.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    ///
+    /// Each input is byte-shuffled so its even 16-bit lanes sit in the low 64 bits and its
+    /// odd lanes in the high 64 bits; the low halves are then joined with `_mm_unpacklo_epi64`.
+    #[inline(always)]
+    fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        unsafe {
+            let evens_then_odds =
+                _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let sorted_a = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let sorted_b = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            let joined = _mm_unpacklo_epi64(sorted_a, sorted_b);
+            joined.simd_into(self)
+        }
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    ///
+    /// Each input is byte-shuffled so its even 16-bit lanes sit in the low 64 bits and its
+    /// odd lanes in the high 64 bits; the high halves are then joined with `_mm_unpackhi_epi64`.
+    #[inline(always)]
+    fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        unsafe {
+            let evens_then_odds =
+                _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let sorted_a = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let sorted_b = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            let joined = _mm_unpackhi_epi64(sorted_a, sorted_b);
+            joined.simd_into(self)
+        }
+    }
+    /// Returns `(zip_low, zip_high)` of `a` and `b` as a pair.
+    #[inline(always)]
+    fn interleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        let low = self.zip_low_i16x8(a, b);
+        let high = self.zip_high_i16x8(a, b);
+        (low, high)
+    }
+    /// Returns `(unzip_low, unzip_high)` of `a` and `b` as a pair.
+    #[inline(always)]
+    fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        let low = self.unzip_low_i16x8(a, b);
+        let high = self.unzip_high_i16x8(a, b);
+        (low, high)
+    }
+    /// Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    ///
+    /// `_mm_mask_blend_epi16` picks its *second* vector operand for set bits, hence `c` is
+    /// passed before `b`.
+    #[inline(always)]
+    fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
+        let picked = unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise signed minimum of `a` and `b` via `_mm_min_epi16`.
+    #[inline(always)]
+    fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let smallest = unsafe { _mm_min_epi16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise signed maximum of `a` and `b` via `_mm_max_epi16`.
+    #[inline(always)]
+    fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let largest = unsafe { _mm_max_epi16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates two 128-bit vectors into one 256-bit vector (`a` low, `b` high) using
+    /// `_mm256_setr_m128i`.
+    #[inline(always)]
+    fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
+        let low: __m128i = a.into();
+        let high: __m128i = b.into();
+        let wide = unsafe { _mm256_setr_m128i(low, high) };
+        wide.simd_into(self)
+    }
+    /// Lane-wise wrapping negation, computed as `0 - a` with `_mm_sub_epi16`.
+    #[inline(always)]
+    fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
+        unsafe {
+            let zero = _mm_setzero_si128();
+            let negated = _mm_sub_epi16(zero, a.into());
+            negated.simd_into(self)
+        }
+    }
+    /// Reinterprets the 128 bits of `a` as sixteen `u8` lanes; no bits are changed.
+    #[inline(always)]
+    fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as four `u32` lanes; no bits are changed.
+    #[inline(always)]
+    fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` to all eight 16-bit lanes via `_mm_set1_epi16`.
+    ///
+    /// The intrinsic takes an `i16`, so the value is bit-cast with `cast_signed` first.
+    #[inline(always)]
+    fn splat_u16x8(self, val: u16) -> u16x8<Self> {
+        let as_signed = val.cast_signed();
+        let filled = unsafe { _mm_set1_epi16(as_signed) };
+        filled.simd_into(self)
+    }
+    /// Builds a vector from an owned array by copying its bytes with
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(&val) };
+        u16x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds a vector from a borrowed array by copying its bytes with
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(val) };
+        u16x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the eight lanes as a plain array by transmuting the underlying `__m128i`.
+    #[inline(always)]
+    fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
+        let raw: __m128i = a.val.0;
+        unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(raw) }
+    }
+    /// Borrows the vector's storage as a shared reference to an array of eight `u16` lanes.
+    #[inline(always)]
+    fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
+        // Reference-to-reference transmute of the inner `__m128i`; the borrow stays tied to `a`.
+        unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
+    }
+    /// Borrows the vector's storage as a mutable reference to an array of eight `u16` lanes.
+    #[inline(always)]
+    fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
+        // Reference-to-reference transmute of the inner `__m128i`; the borrow stays tied to `a`.
+        unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
+    }
+    /// Writes the eight lanes of `a` into `dest` with a raw element copy.
+    #[inline(always)]
+    fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
+        let src = (&raw const a.val.0) as *const u16;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 8usize);
+        }
+    }
+    /// Reinterprets a byte vector as a `u16x8` by transmuting its storage.
+    #[inline(always)]
+    fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u16x8 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets a `u16x8` as a byte vector by transmuting its storage.
+    #[inline(always)]
+    fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a window of eight lanes across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// A `SHIFT` of eight or more returns `b` unchanged. Otherwise both inputs are viewed as
+    /// bytes and handed to `dyn_alignr_128` with a byte offset of `SHIFT * 2`.
+    #[inline(always)]
+    fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_u16x8(b).val.0;
+        let a_bytes = self.cvt_to_bytes_u16x8(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 2usize) };
+        let as_bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_u16x8(as_bytes)
+    }
+    /// Block-wise slide; this simply forwards to `slide_u16x8` with the same `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_u16x8<const SHIFT: usize>(
+        self,
+        a: u16x8<Self>,
+        b: u16x8<Self>,
+    ) -> u16x8<Self> {
+        Self::slide_u16x8::<SHIFT>(self, a, b)
+    }
+    /// Lane-wise wrapping addition of 16-bit lanes via `_mm_add_epi16`.
+    #[inline(always)]
+    fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let sum = unsafe { _mm_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise wrapping subtraction (`a - b`) of 16-bit lanes via `_mm_sub_epi16`.
+    #[inline(always)]
+    fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let difference = unsafe { _mm_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise multiplication keeping the low 16 bits of each product (`_mm_mullo_epi16`).
+    #[inline(always)]
+    fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let product = unsafe { _mm_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of `a` and `b`, performed on the full 128-bit register via `_mm_and_si128`.
+    #[inline(always)]
+    fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let out = unsafe { _mm_and_si128(lhs, rhs) };
+        out.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b`, performed on the full 128-bit register via `_mm_or_si128`.
+    #[inline(always)]
+    fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let out = unsafe { _mm_or_si128(lhs, rhs) };
+        out.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b`, performed on the full 128-bit register via `_mm_xor_si128`.
+    #[inline(always)]
+    fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let out = unsafe { _mm_xor_si128(lhs, rhs) };
+        out.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with the all-ones scalar `!0`.
+    #[inline(always)]
+    fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
+        // Goes through the vector type's `^` operator overload rather than a direct intrinsic.
+        a ^ !0
+    }
+    /// Shifts every 16-bit lane left by the same `shift` count using `_mm_sll_epi16`.
+    #[inline(always)]
+    fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
+        let lanes: __m128i = a.into();
+        unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Per-lane left shift: lane `i` of `a` is shifted by lane `i` of `b`.
+    ///
+    /// Implemented lane by lane with the scalar `Shl` operator, then converted back to a vector.
+    #[inline(always)]
+    fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lanes: [u16; 8usize] =
+            core::array::from_fn(|lane| core::ops::Shl::shl(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Logical (zero-filling) right shift of every 16-bit lane by `shift`, using `_mm_srl_epi16`.
+    #[inline(always)]
+    fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
+        let lanes: __m128i = a.into();
+        unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_srl_epi16(lanes, count).simd_into(self)
+        }
+    }
+    /// Per-lane right shift: lane `i` of `a` is shifted by lane `i` of `b`.
+    ///
+    /// Implemented lane by lane with the scalar `Shr` operator, then converted back to a vector.
+    #[inline(always)]
+    fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lanes: [u16; 8usize] =
+            core::array::from_fn(|lane| core::ops::Shr::shr(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b`; the bitmask from `_mm_cmpeq_epu16_mask` is stored directly in the mask.
+    #[inline(always)]
+    fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmpeq_epu16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a < b`; the bitmask from `_mm_cmplt_epu16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmplt_epu16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a <= b`; the bitmask from `_mm_cmple_epu16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmple_epu16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a >= b`; the bitmask from `_mm_cmpge_epu16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmpge_epu16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a > b`; the bitmask from `_mm_cmpgt_epu16_mask` is stored in the mask.
+    #[inline(always)]
+    fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
+        let lanes = unsafe { _mm_cmpgt_epu16_mask(a.into(), b.into()) };
+        mask16x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Interleaves the low halves of `a` and `b` lane by lane using `_mm_unpacklo_epi16`.
+    #[inline(always)]
+    fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        let zipped = unsafe { _mm_unpacklo_epi16(first, second) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves of `a` and `b` lane by lane using `_mm_unpackhi_epi16`.
+    #[inline(always)]
+    fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let first: __m128i = a.into();
+        let second: __m128i = b.into();
+        let zipped = unsafe { _mm_unpackhi_epi16(first, second) };
+        zipped.simd_into(self)
+    }
+    /// Collects the even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    ///
+    /// Each input is byte-shuffled so its even 16-bit lanes sit in the low 64 bits and its
+    /// odd lanes in the high 64 bits; the low halves are then joined with `_mm_unpacklo_epi64`.
+    #[inline(always)]
+    fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        unsafe {
+            let evens_then_odds =
+                _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let sorted_a = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let sorted_b = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            let joined = _mm_unpacklo_epi64(sorted_a, sorted_b);
+            joined.simd_into(self)
+        }
+    }
+    /// Collects the odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    ///
+    /// Each input is byte-shuffled so its even 16-bit lanes sit in the low 64 bits and its
+    /// odd lanes in the high 64 bits; the high halves are then joined with `_mm_unpackhi_epi64`.
+    #[inline(always)]
+    fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        unsafe {
+            let evens_then_odds =
+                _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let sorted_a = _mm_shuffle_epi8(a.into(), evens_then_odds);
+            let sorted_b = _mm_shuffle_epi8(b.into(), evens_then_odds);
+            let joined = _mm_unpackhi_epi64(sorted_a, sorted_b);
+            joined.simd_into(self)
+        }
+    }
+    /// Returns `(zip_low, zip_high)` of `a` and `b` as a pair.
+    #[inline(always)]
+    fn interleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        let low = self.zip_low_u16x8(a, b);
+        let high = self.zip_high_u16x8(a, b);
+        (low, high)
+    }
+    /// Returns `(unzip_low, unzip_high)` of `a` and `b` as a pair.
+    #[inline(always)]
+    fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        let low = self.unzip_low_u16x8(a, b);
+        let high = self.unzip_high_u16x8(a, b);
+        (low, high)
+    }
+    /// Per-lane select: takes the lane from `b` where the mask bit in `a` is set, else from `c`.
+    ///
+    /// `_mm_mask_blend_epi16` picks its *second* vector operand for set bits, hence `c` is
+    /// passed before `b`.
+    #[inline(always)]
+    fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
+        let picked = unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum of `a` and `b` via `_mm_min_epu16`.
+    #[inline(always)]
+    fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let smallest = unsafe { _mm_min_epu16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum of `a` and `b` via `_mm_max_epu16`.
+    #[inline(always)]
+    fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let largest = unsafe { _mm_max_epu16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates two 128-bit vectors into one 256-bit vector (`a` low, `b` high) using
+    /// `_mm256_setr_m128i`.
+    #[inline(always)]
+    fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
+        let low: __m128i = a.into();
+        let high: __m128i = b.into();
+        let wide = unsafe { _mm256_setr_m128i(low, high) };
+        wide.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as sixteen `u8` lanes; no bits are changed.
+    #[inline(always)]
+    fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the 128 bits of `a` as four `u32` lanes; no bits are changed.
+    #[inline(always)]
+    fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a mask with all eight lanes set to `val` (bitmask `0xFF` for `true`, `0` for `false`).
+    #[inline(always)]
+    fn splat_mask16x8(self, val: bool) -> mask16x8<Self> {
+        let bits = if val { 255u64 } else { 0 };
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Packs an array of lane values into a compact bitmask: bit `i` is set when `val[i] != 0`.
+    #[inline(always)]
+    fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
+        let mut bits = 0u64;
+        for (lane, entry) in val.iter().enumerate() {
+            if *entry != 0 {
+                bits |= 1u64 << lane;
+            }
+        }
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Expands the compact bitmask into an array: lane `i` is all-ones (`!0`) when bit `i`
+    /// is set and `0` otherwise.
+    #[inline(always)]
+    fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
+        let packed = u64::from(a.val);
+        core::array::from_fn(|lane| {
+            let is_set = ((packed >> lane) & 1) != 0;
+            if is_set { !0 } else { 0 }
+        })
+    }
+    /// Creates a mask from the low 8 bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        let lanes = bits & 255u64;
+        mask16x8 {
+            val: lanes as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as a `u64` bitmask, one bit per lane in the low 8 bits.
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 255u64
+    }
+    /// Lane-wise logical AND of two masks, computed on their bitmask representation.
+    #[inline(always)]
+    fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs & rhs) & 255u64;
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical OR of two masks, computed on their bitmask representation.
+    #[inline(always)]
+    fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs | rhs) & 255u64;
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical XOR of two masks, computed on their bitmask representation.
+    #[inline(always)]
+    fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        let bits = (lhs ^ rhs) & 255u64;
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise logical NOT of a mask; the result is clamped to the 8 valid lane bits.
+    #[inline(always)]
+    fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
+        let inverted = !u64::from(a.val);
+        let bits = inverted & 255u64;
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between two masks: lane from `b` where `a` is set, otherwise from `c`.
+    ///
+    /// Computed as `(a & b) | (!a & c)` on the bitmask representation.
+    #[inline(always)]
+    fn select_mask16x8(
+        self,
+        a: mask16x8<Self>,
+        b: mask16x8<Self>,
+        c: mask16x8<Self>,
+    ) -> mask16x8<Self> {
+        let cond = u64::from(a.val);
+        let when_set = cond & u64::from(b.val);
+        let when_clear = !cond & u64::from(c.val);
+        let bits = (when_set | when_clear) & 255u64;
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a lane is set where `a` and `b` agree (XNOR).
+    #[inline(always)]
+    fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        let bits = !differing & 255u64;
+        mask16x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` when at least one of the eight lanes is set.
+    #[inline(always)]
+    fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) != 0
+    }
+    /// Returns `true` when every one of the eight lanes is set.
+    #[inline(always)]
+    fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) == 255u64
+    }
+    /// Returns `true` when at least one of the eight lanes is clear.
+    #[inline(always)]
+    fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) != 255u64
+    }
+    /// Returns `true` when none of the eight lanes is set.
+    #[inline(always)]
+    fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
+        (u64::from(a.val) & 255u64) == 0
+    }
+    /// Concatenates two 8-lane masks into a 16-lane mask: `a` fills bits 0..8, `b` bits 8..16.
+    #[inline(always)]
+    fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 8usize;
+        let joined = (low | high) & 65535u64;
+        mask16x16 {
+            val: joined as _,
+            simd: self,
+        }
+    }
+    /// Broadcasts `val` to all four 32-bit lanes via `_mm_set1_epi32`.
+    #[inline(always)]
+    fn splat_i32x4(self, val: i32) -> i32x4<Self> {
+        let filled = unsafe { _mm_set1_epi32(val) };
+        filled.simd_into(self)
+    }
+    /// Builds a vector from an owned array by copying its bytes with
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(&val) };
+        i32x4 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds a vector from a borrowed array by copying its bytes with
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(val) };
+        i32x4 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the four lanes as a plain array by transmuting the underlying `__m128i`.
+    #[inline(always)]
+    fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
+        let raw: __m128i = a.val.0;
+        unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(raw) }
+    }
+    /// Borrows the vector's storage as a shared reference to an array of four `i32` lanes.
+    #[inline(always)]
+    fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
+        // Reference-to-reference transmute of the inner `__m128i`; the borrow stays tied to `a`.
+        unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
+    }
+    /// Borrows the vector's storage as a mutable reference to an array of four `i32` lanes.
+    #[inline(always)]
+    fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
+        // Reference-to-reference transmute of the inner `__m128i`; the borrow stays tied to `a`.
+        unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
+    }
+    /// Writes the four lanes of `a` into `dest` with a raw element copy.
+    #[inline(always)]
+    fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
+        let src = (&raw const a.val.0) as *const i32;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 4usize);
+        }
+    }
+    /// Reinterprets a byte vector as an `i32x4` by transmuting its storage.
+    #[inline(always)]
+    fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        i32x4 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets an `i32x4` as a byte vector by transmuting its storage.
+    #[inline(always)]
+    fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a window of four lanes across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// A `SHIFT` of four or more returns `b` unchanged. Otherwise both inputs are viewed as
+    /// bytes and handed to `dyn_alignr_128` with a byte offset of `SHIFT * 4`.
+    #[inline(always)]
+    fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let b_bytes = self.cvt_to_bytes_i32x4(b).val.0;
+        let a_bytes = self.cvt_to_bytes_i32x4(a).val.0;
+        let shifted = unsafe { dyn_alignr_128(b_bytes, a_bytes, SHIFT * 4usize) };
+        let as_bytes = u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        };
+        self.cvt_from_bytes_i32x4(as_bytes)
+    }
+    /// Block-wise slide; this simply forwards to `slide_i32x4` with the same `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_i32x4<const SHIFT: usize>(
+        self,
+        a: i32x4<Self>,
+        b: i32x4<Self>,
+    ) -> i32x4<Self> {
+        Self::slide_i32x4::<SHIFT>(self, a, b)
+    }
+    /// Lane-wise wrapping addition of 32-bit lanes via `_mm_add_epi32`.
+    #[inline(always)]
+    fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let lhs: __m128i = a.into();
+        let rhs: __m128i = b.into();
+        let sum = unsafe { _mm_add_epi32(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Subtracts the lanes of `b` from the lanes of `a` with `_mm_sub_epi32`.
+    #[inline(always)]
+    fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let difference = unsafe { _mm_sub_epi32(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Multiplies corresponding lanes with `_mm_mullo_epi32` (low 32 bits of each product).
+    #[inline(always)]
+    fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let product = unsafe { _mm_mullo_epi32(a.into(), b.into()) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the two 128-bit vectors via `_mm_and_si128`.
+    #[inline(always)]
+    fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let bits = unsafe { _mm_and_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the two 128-bit vectors via `_mm_or_si128`.
+    #[inline(always)]
+    fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let bits = unsafe { _mm_or_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the two 128-bit vectors via `_mm_xor_si128`.
+    #[inline(always)]
+    fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let bits = unsafe { _mm_xor_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, written as an XOR with the all-ones scalar `!0`.
+    // NOTE(review): relies on the vector type's `^` operator accepting a scalar right-hand
+    // side; that impl lives outside this block.
+    #[inline(always)]
+    fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
+        a ^ !0
+    }
+    /// Shifts every lane of `a` left by the same scalar `shift` using `_mm_sll_epi32`.
+    #[inline(always)]
+    fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
+        let shifted = unsafe {
+            // The shift count travels in the low element of a vector register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the count in the matching lane of `b` (`_mm_sllv_epi32`).
+    #[inline(always)]
+    fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let shifted = unsafe { _mm_sllv_epi32(a.into(), b.into()) };
+        shifted.simd_into(self)
+    }
+    /// Shifts every lane of `a` right by the same scalar `shift` using the arithmetic shift
+    /// `_mm_sra_epi32`.
+    #[inline(always)]
+    fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
+        let shifted = unsafe {
+            // The shift count travels in the low element of a vector register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sra_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Arithmetic right shift of each lane of `a` by the matching lane of `b`
+    /// (`_mm_srav_epi32`).
+    #[inline(always)]
+    fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let shifted = unsafe { _mm_srav_epi32(a.into(), b.into()) };
+        shifted.simd_into(self)
+    }
+    /// Lane-wise `a == b`; the bitmask produced by `_mm_cmpeq_epi32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmpeq_epi32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise signed `a < b`; the bitmask from `_mm_cmplt_epi32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmplt_epi32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise signed `a <= b`; the bitmask from `_mm_cmple_epi32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmple_epi32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise signed `a >= b`; the bitmask from `_mm_cmpge_epi32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmpge_epi32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise signed `a > b`; the bitmask from `_mm_cmpgt_epi32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmpgt_epi32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Interleaves the low halves of `a` and `b` with `_mm_unpacklo_epi32`.
+    #[inline(always)]
+    fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let zipped = unsafe { _mm_unpacklo_epi32(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves of `a` and `b` with `_mm_unpackhi_epi32`.
+    #[inline(always)]
+    fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let zipped = unsafe { _mm_unpackhi_epi32(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes: `[a0, a2, b0, b2]`.
+    #[inline(always)]
+    fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        // Lane order 0, 2, 1, 3: even lanes land in the low 64 bits, odd lanes in the high.
+        const EVENS_THEN_ODDS: i32 = 0b11_01_10_00;
+        let evens = unsafe {
+            let sorted_a = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(a.into());
+            let sorted_b = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(b.into());
+            _mm_unpacklo_epi64(sorted_a, sorted_b)
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes: `[a1, a3, b1, b3]`.
+    #[inline(always)]
+    fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        // Lane order 0, 2, 1, 3: even lanes land in the low 64 bits, odd lanes in the high.
+        const EVENS_THEN_ODDS: i32 = 0b11_01_10_00;
+        let odds = unsafe {
+            let sorted_a = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(a.into());
+            let sorted_b = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(b.into());
+            _mm_unpackhi_epi64(sorted_a, sorted_b)
+        };
+        odds.simd_into(self)
+    }
+    /// Returns `(zip_low_i32x4(a, b), zip_high_i32x4(a, b))`.
+    #[inline(always)]
+    fn interleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
+        let low = self.zip_low_i32x4(a, b);
+        let high = self.zip_high_i32x4(a, b);
+        (low, high)
+    }
+    /// Returns `(unzip_low_i32x4(a, b), unzip_high_i32x4(a, b))`.
+    #[inline(always)]
+    fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
+        let evens = self.unzip_low_i32x4(a, b);
+        let odds = self.unzip_high_i32x4(a, b);
+        (evens, odds)
+    }
+    /// Per lane, yields `b` where mask `a` is set and `c` where it is clear.
+    #[inline(always)]
+    fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
+        // `_mm_mask_blend_epi32` takes its second vector operand for set mask bits, so the
+        // "true" value `b` goes last.
+        let chosen = unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()) };
+        chosen.simd_into(self)
+    }
+    /// Lane-wise signed minimum via `_mm_min_epi32`.
+    #[inline(always)]
+    fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let smaller = unsafe { _mm_min_epi32(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise signed maximum via `_mm_max_epi32`.
+    #[inline(always)]
+    fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
+        let larger = unsafe { _mm_max_epi32(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Joins two 128-bit vectors into one 256-bit vector, `a` in the low half and `b` in the
+    /// high half (`_mm256_setr_m128i`).
+    #[inline(always)]
+    fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
+        let joined = unsafe { _mm256_setr_m128i(a.into(), b.into()) };
+        joined.simd_into(self)
+    }
+    /// Negates every lane by computing `0 - a` with `_mm_sub_epi32`.
+    #[inline(always)]
+    fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
+        let negated = unsafe {
+            let zero = _mm_setzero_si128();
+            _mm_sub_epi32(zero, a.into())
+        };
+        negated.simd_into(self)
+    }
+    /// Views the same 128 bits as a `u8x16` by round-tripping through the raw `__m128i`.
+    #[inline(always)]
+    fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Views the same 128 bits as a `u32x4` by round-tripping through the raw `__m128i`.
+    #[inline(always)]
+    fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Converts each signed 32-bit lane to `f32` with `_mm_cvtepi32_ps`.
+    #[inline(always)]
+    fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
+        let floats = unsafe { _mm_cvtepi32_ps(a.into()) };
+        floats.simd_into(self)
+    }
+    /// Broadcasts `val` to all four lanes.
+    #[inline(always)]
+    fn splat_u32x4(self, val: u32) -> u32x4<Self> {
+        // `_mm_set1_epi32` takes an `i32`; the cast keeps the bit pattern.
+        let lane = val.cast_signed();
+        let filled = unsafe { _mm_set1_epi32(lane) };
+        filled.simd_into(self)
+    }
+    /// Builds a `u32x4` from an array of four lanes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
+        let val = unsafe { crate::support::checked_transmute_copy(&val) };
+        u32x4 { val, simd: self }
+    }
+    /// Builds a `u32x4` from a borrowed array of four lanes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
+        let val = unsafe { crate::support::checked_transmute_copy(val) };
+        u32x4 { val, simd: self }
+    }
+    /// Returns the four lanes of `a` as a plain array.
+    #[inline(always)]
+    fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
+        let raw: __m128i = a.val.0;
+        // SAFETY: `__m128i` and `[u32; 4]` are both 16 bytes of plain integer data.
+        unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(raw) }
+    }
+    /// Reborrows the `__m128i` payload of `a` as a shared array of four `u32` lanes.
+    #[inline(always)]
+    fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
+        let lanes = (&raw const a.val.0).cast::<[u32; 4usize]>();
+        // SAFETY: `lanes` points at the live `__m128i` inside `a`; that type is as large as
+        // `[u32; 4]` and more strictly aligned, and the shared borrow of `a` is carried over.
+        unsafe { &*lanes }
+    }
+    /// Reborrows the `__m128i` payload of `a` as a mutable array of four `u32` lanes.
+    #[inline(always)]
+    fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
+        let lanes = (&raw mut a.val.0).cast::<[u32; 4usize]>();
+        // SAFETY: `lanes` points at the live `__m128i` inside `a`; that type is as large as
+        // `[u32; 4]` and more strictly aligned, and the exclusive borrow of `a` is carried over.
+        unsafe { &mut *lanes }
+    }
+    /// Writes the four `u32` lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
+        // SAFETY: `__m128i` and `[u32; 4]` are both 16 bytes of plain integer data.
+        *dest = unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) };
+    }
+    /// Reinterprets the storage of a `u8x16` as a `u32x4`; no bits are changed.
+    #[inline(always)]
+    fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
+        // SAFETY: `transmute` only compiles for equally sized types, and both storages are
+        // plain 128-bit register wrappers.
+        let val = unsafe { core::mem::transmute(a.val) };
+        u32x4 { val, simd: self }
+    }
+    /// Reinterprets the storage of a `u32x4` as a `u8x16`; no bits are changed.
+    #[inline(always)]
+    fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
+        // SAFETY: `transmute` only compiles for equally sized types, and both storages are
+        // plain 128-bit register wrappers.
+        let val = unsafe { core::mem::transmute(a.val) };
+        u8x16 { val, simd: self }
+    }
+    /// Slides across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// Returns `b` unchanged when `SHIFT >= 4`. Otherwise both inputs are handed to
+    /// `dyn_alignr_128` as raw bytes with a byte offset of `SHIFT * 4`, and the result is
+    /// reinterpreted as `u32x4`.
+    #[inline(always)]
+    fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let upper = self.cvt_to_bytes_u32x4(b).val.0;
+        let lower = self.cvt_to_bytes_u32x4(a).val.0;
+        // Four bytes per `u32` lane.
+        let shifted = unsafe { dyn_alignr_128(upper, lower, SHIFT * 4usize) };
+        self.cvt_from_bytes_u32x4(u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        })
+    }
+    /// Block-wise slide for `u32x4`; forwards unchanged to `slide_u32x4` with the same `SHIFT`.
+    // NOTE(review): presumably a 128-bit vector is a single block, which is why the two
+    // operations coincide — confirm against the trait documentation.
+    #[inline(always)]
+    fn slide_within_blocks_u32x4<const SHIFT: usize>(
+        self,
+        a: u32x4<Self>,
+        b: u32x4<Self>,
+    ) -> u32x4<Self> {
+        self.slide_u32x4::<SHIFT>(a, b)
+    }
+    /// Adds corresponding lanes of `a` and `b` with `_mm_add_epi32`.
+    #[inline(always)]
+    fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let sum = unsafe { _mm_add_epi32(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Subtracts the lanes of `b` from the lanes of `a` with `_mm_sub_epi32`.
+    #[inline(always)]
+    fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let difference = unsafe { _mm_sub_epi32(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Multiplies corresponding lanes with `_mm_mullo_epi32` (low 32 bits of each product).
+    #[inline(always)]
+    fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let product = unsafe { _mm_mullo_epi32(a.into(), b.into()) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the two 128-bit vectors via `_mm_and_si128`.
+    #[inline(always)]
+    fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let bits = unsafe { _mm_and_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the two 128-bit vectors via `_mm_or_si128`.
+    #[inline(always)]
+    fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let bits = unsafe { _mm_or_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the two 128-bit vectors via `_mm_xor_si128`.
+    #[inline(always)]
+    fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let bits = unsafe { _mm_xor_si128(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, written as an XOR with the all-ones scalar `!0`.
+    // NOTE(review): relies on the vector type's `^` operator accepting a scalar right-hand
+    // side; that impl lives outside this block.
+    #[inline(always)]
+    fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
+        a ^ !0
+    }
+    /// Shifts every lane of `a` left by the same scalar `shift` using `_mm_sll_epi32`.
+    #[inline(always)]
+    fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
+        let shifted = unsafe {
+            // The shift count travels in the low element of a vector register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_sll_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the count in the matching lane of `b` (`_mm_sllv_epi32`).
+    #[inline(always)]
+    fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let shifted = unsafe { _mm_sllv_epi32(a.into(), b.into()) };
+        shifted.simd_into(self)
+    }
+    /// Shifts every lane of `a` right by the same scalar `shift` using the logical shift
+    /// `_mm_srl_epi32`.
+    #[inline(always)]
+    fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
+        let shifted = unsafe {
+            // The shift count travels in the low element of a vector register.
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm_srl_epi32(a.into(), count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Logical right shift of each lane of `a` by the matching lane of `b` (`_mm_srlv_epi32`).
+    #[inline(always)]
+    fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let shifted = unsafe { _mm_srlv_epi32(a.into(), b.into()) };
+        shifted.simd_into(self)
+    }
+    /// Lane-wise `a == b`; the bitmask produced by `_mm_cmpeq_epu32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmpeq_epu32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a < b`; the bitmask from `_mm_cmplt_epu32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmplt_epu32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a <= b`; the bitmask from `_mm_cmple_epu32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmple_epu32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a >= b`; the bitmask from `_mm_cmpge_epu32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmpge_epu32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a > b`; the bitmask from `_mm_cmpgt_epu32_mask` is stored directly.
+    #[inline(always)]
+    fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
+        let val = unsafe { _mm_cmpgt_epu32_mask(a.into(), b.into()) };
+        mask32x4 { val, simd: self }
+    }
+    /// Interleaves the low halves of `a` and `b` with `_mm_unpacklo_epi32`.
+    #[inline(always)]
+    fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let zipped = unsafe { _mm_unpacklo_epi32(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high halves of `a` and `b` with `_mm_unpackhi_epi32`.
+    #[inline(always)]
+    fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let zipped = unsafe { _mm_unpackhi_epi32(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes: `[a0, a2, b0, b2]`.
+    #[inline(always)]
+    fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        // Lane order 0, 2, 1, 3: even lanes land in the low 64 bits, odd lanes in the high.
+        const EVENS_THEN_ODDS: i32 = 0b11_01_10_00;
+        let evens = unsafe {
+            let sorted_a = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(a.into());
+            let sorted_b = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(b.into());
+            _mm_unpacklo_epi64(sorted_a, sorted_b)
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes: `[a1, a3, b1, b3]`.
+    #[inline(always)]
+    fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        // Lane order 0, 2, 1, 3: even lanes land in the low 64 bits, odd lanes in the high.
+        const EVENS_THEN_ODDS: i32 = 0b11_01_10_00;
+        let odds = unsafe {
+            let sorted_a = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(a.into());
+            let sorted_b = _mm_shuffle_epi32::<EVENS_THEN_ODDS>(b.into());
+            _mm_unpackhi_epi64(sorted_a, sorted_b)
+        };
+        odds.simd_into(self)
+    }
+    /// Returns `(zip_low_u32x4(a, b), zip_high_u32x4(a, b))`.
+    #[inline(always)]
+    fn interleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        let low = self.zip_low_u32x4(a, b);
+        let high = self.zip_high_u32x4(a, b);
+        (low, high)
+    }
+    /// Returns `(unzip_low_u32x4(a, b), unzip_high_u32x4(a, b))`.
+    #[inline(always)]
+    fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        let evens = self.unzip_low_u32x4(a, b);
+        let odds = self.unzip_high_u32x4(a, b);
+        (evens, odds)
+    }
+    /// Per lane, yields `b` where mask `a` is set and `c` where it is clear.
+    #[inline(always)]
+    fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
+        // `_mm_mask_blend_epi32` takes its second vector operand for set mask bits, so the
+        // "true" value `b` goes last.
+        let chosen = unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()) };
+        chosen.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum via `_mm_min_epu32`.
+    #[inline(always)]
+    fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let smaller = unsafe { _mm_min_epu32(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum via `_mm_max_epu32`.
+    #[inline(always)]
+    fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
+        let larger = unsafe { _mm_max_epu32(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Joins two 128-bit vectors into one 256-bit vector, `a` in the low half and `b` in the
+    /// high half (`_mm256_setr_m128i`).
+    #[inline(always)]
+    fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
+        let joined = unsafe { _mm256_setr_m128i(a.into(), b.into()) };
+        joined.simd_into(self)
+    }
+    /// Views the same 128 bits as a `u8x16` by round-tripping through the raw `__m128i`.
+    #[inline(always)]
+    fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
+        let raw: __m128i = a.into();
+        raw.simd_into(self)
+    }
+    /// Converts each unsigned 32-bit lane to `f32` without an unsigned-convert instruction.
+    ///
+    /// Each lane is split into 16-bit halves that are planted in the mantissa of a float with
+    /// a fixed exponent; one subtraction and one addition then recombine them.
+    #[inline(always)]
+    fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
+        unsafe {
+            let a = a.into();
+            // Blend mask 0xAA takes the upper 16-bit word of every lane from the constant:
+            // `lo` has bit pattern 0x4B00_xxxx, i.e. the float 2^23 + (a & 0xFFFF).
+            let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
+            // `hi` has bit pattern 0x5300_yyyy, i.e. the float 2^39 + (a >> 16) * 2^16.
+            let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
+            // 0x53000080 is 2^39 + 2^23, so this removes both biases at once; the difference
+            // is exactly representable.
+            let fhi = _mm_sub_ps(
+                _mm_castsi128_ps(hi),
+                _mm_set1_ps(f32::from_bits(0x53000080)),
+            );
+            // The only operation that can round is this final addition.
+            let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
+            result.simd_into(self)
+        }
+    }
+    /// Builds a mask with all four lane bits set (`val == true`) or all clear.
+    #[inline(always)]
+    fn splat_mask32x4(self, val: bool) -> mask32x4<Self> {
+        let bits: u64 = if val { 15u64 } else { 0 };
+        mask32x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Packs an array of lane values into a bitmask: bit `i` is set iff `val[i] != 0`.
+    #[inline(always)]
+    fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
+        let bits = val
+            .iter()
+            .enumerate()
+            .filter(|&(_, &lane)| lane != 0)
+            .fold(0u64, |acc, (i, _)| acc | (1u64 << i));
+        mask32x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Expands the bitmask into lane values: all-ones (`!0`) for a set bit, `0` otherwise.
+    #[inline(always)]
+    fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
+        let bits = u64::from(a.val);
+        core::array::from_fn(|lane| match (bits >> lane) & 1 {
+            0 => 0,
+            _ => !0,
+        })
+    }
+    /// Builds a mask from the low four bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        let lanes = bits & 15u64;
+        mask32x4 {
+            val: lanes as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as an integer with one bit per lane (only the low four bits can be set).
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 15u64
+    }
+    /// Lane-wise AND of two masks, limited to the four valid lane bits.
+    #[inline(always)]
+    fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask32x4 {
+            val: (lhs & rhs & 15u64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise OR of two masks, limited to the four valid lane bits.
+    #[inline(always)]
+    fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask32x4 {
+            val: ((lhs | rhs) & 15u64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise XOR of two masks, limited to the four valid lane bits.
+    #[inline(always)]
+    fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask32x4 {
+            val: ((lhs ^ rhs) & 15u64) as _,
+            simd: self,
+        }
+    }
+    /// Inverts every lane of the mask; bits above the four lane bits stay clear.
+    #[inline(always)]
+    fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
+        let inverted = !u64::from(a.val);
+        mask32x4 {
+            val: (inverted & 15u64) as _,
+            simd: self,
+        }
+    }
+    /// Per lane, takes the bit of `b` where `a` is set and the bit of `c` where `a` is clear.
+    #[inline(always)]
+    fn select_mask32x4(
+        self,
+        a: mask32x4<Self>,
+        b: mask32x4<Self>,
+        c: mask32x4<Self>,
+    ) -> mask32x4<Self> {
+        let chooser = u64::from(a.val);
+        let from_b = chooser & u64::from(b.val);
+        let from_c = !chooser & u64::from(c.val);
+        mask32x4 {
+            val: ((from_b | from_c) & 15u64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result bit is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
+        // XOR marks the lanes that differ; inverting it marks the lanes that match.
+        let differing = u64::from(a.val ^ b.val);
+        mask32x4 {
+            val: (!differing & 15u64) as _,
+            simd: self,
+        }
+    }
+    /// `true` if at least one of the four lane bits is set.
+    #[inline(always)]
+    fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) != 0
+    }
+    /// `true` if all four lane bits are set.
+    #[inline(always)]
+    fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) == 15u64
+    }
+    /// `true` if at least one of the four lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) != 15u64
+    }
+    /// `true` if none of the four lane bits is set.
+    #[inline(always)]
+    fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
+        (u64::from(a.val) & 15u64) == 0
+    }
+    /// Concatenates two 4-lane masks into an 8-lane mask: `a` supplies bits 0..=3 and `b`
+    /// supplies bits 4..=7.
+    ///
+    /// Each half is reduced to its four lane bits before merging, matching the defensive
+    /// masking of the other mask operations; a stray high bit in `a.val` therefore cannot
+    /// leak into the lanes that belong to `b`.
+    #[inline(always)]
+    fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
+        let low = u64::from(a.val) & 15u64;
+        let high = (u64::from(b.val) & 15u64) << 4usize;
+        mask32x8 {
+            val: (low | high) as _,
+            simd: self,
+        }
+    }
+    /// Broadcasts `val` to both lanes with `_mm_set1_pd`.
+    #[inline(always)]
+    fn splat_f64x2(self, val: f64) -> f64x2<Self> {
+        let filled = unsafe { _mm_set1_pd(val) };
+        filled.simd_into(self)
+    }
+    /// Builds an `f64x2` from an array of two lanes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
+        let val = unsafe { crate::support::checked_transmute_copy(&val) };
+        f64x2 { val, simd: self }
+    }
+    /// Builds an `f64x2` from a borrowed array of two lanes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
+        let val = unsafe { crate::support::checked_transmute_copy(val) };
+        f64x2 { val, simd: self }
+    }
+    /// Returns the two lanes of `a` as a plain array.
+    #[inline(always)]
+    fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
+        let raw: __m128d = a.val.0;
+        // SAFETY: `__m128d` and `[f64; 2]` are both 16 bytes of plain floating-point data.
+        unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(raw) }
+    }
+    /// Reborrows the `__m128d` payload of `a` as a shared array of two `f64` lanes.
+    #[inline(always)]
+    fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
+        let lanes = (&raw const a.val.0).cast::<[f64; 2usize]>();
+        // SAFETY: `lanes` points at the live `__m128d` inside `a`; that type is as large as
+        // `[f64; 2]` and more strictly aligned, and the shared borrow of `a` is carried over.
+        unsafe { &*lanes }
+    }
+    /// Reborrows the `__m128d` payload of `a` as a mutable array of two `f64` lanes.
+    #[inline(always)]
+    fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
+        let lanes = (&raw mut a.val.0).cast::<[f64; 2usize]>();
+        // SAFETY: `lanes` points at the live `__m128d` inside `a`; that type is as large as
+        // `[f64; 2]` and more strictly aligned, and the exclusive borrow of `a` is carried over.
+        unsafe { &mut *lanes }
+    }
+    /// Writes the two `f64` lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
+        // SAFETY: `__m128d` and `[f64; 2]` are both 16 bytes of plain floating-point data.
+        *dest = unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) };
+    }
+    /// Reinterprets the storage of a `u8x16` as an `f64x2`; no bits are changed.
+    #[inline(always)]
+    fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
+        // SAFETY: `transmute` only compiles for equally sized types, and both storages are
+        // plain 128-bit register wrappers.
+        let val = unsafe { core::mem::transmute(a.val) };
+        f64x2 { val, simd: self }
+    }
+    /// Reinterprets the storage of an `f64x2` as a `u8x16`; no bits are changed.
+    #[inline(always)]
+    fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
+        // SAFETY: `transmute` only compiles for equally sized types, and both storages are
+        // plain 128-bit register wrappers.
+        let val = unsafe { core::mem::transmute(a.val) };
+        u8x16 { val, simd: self }
+    }
+    /// Slides across the pair `a`, `b` by `SHIFT` lanes.
+    ///
+    /// Returns `b` unchanged when `SHIFT >= 2`. Otherwise both inputs are handed to
+    /// `dyn_alignr_128` as raw bytes with a byte offset of `SHIFT * 8`, and the result is
+    /// reinterpreted as `f64x2`.
+    #[inline(always)]
+    fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let upper = self.cvt_to_bytes_f64x2(b).val.0;
+        let lower = self.cvt_to_bytes_f64x2(a).val.0;
+        // Eight bytes per `f64` lane.
+        let shifted = unsafe { dyn_alignr_128(upper, lower, SHIFT * 8usize) };
+        self.cvt_from_bytes_f64x2(u8x16 {
+            val: crate::support::Aligned128(shifted),
+            simd: self,
+        })
+    }
+    /// Block-wise slide for `f64x2`; forwards unchanged to `slide_f64x2` with the same `SHIFT`.
+    // NOTE(review): presumably a 128-bit vector is a single block, which is why the two
+    // operations coincide — confirm against the trait documentation.
+    #[inline(always)]
+    fn slide_within_blocks_f64x2<const SHIFT: usize>(
+        self,
+        a: f64x2<Self>,
+        b: f64x2<Self>,
+    ) -> f64x2<Self> {
+        self.slide_f64x2::<SHIFT>(a, b)
+    }
+    /// Absolute value of each lane, obtained by clearing the sign bit.
+    #[inline(always)]
+    fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        let magnitude = unsafe {
+            // `-0.0` has only the sign bit set; and-not keeps every other bit of `a`.
+            let sign_bit = _mm_set1_pd(-0.0);
+            _mm_andnot_pd(sign_bit, a.into())
+        };
+        magnitude.simd_into(self)
+    }
+    /// Negates each lane by flipping the sign bit.
+    #[inline(always)]
+    fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        let flipped = unsafe {
+            // `-0.0` has only the sign bit set, so the XOR toggles exactly that bit.
+            let sign_bit = _mm_set1_pd(-0.0);
+            _mm_xor_pd(a.into(), sign_bit)
+        };
+        flipped.simd_into(self)
+    }
+    /// Lane-wise square root via `_mm_sqrt_pd`.
+    #[inline(always)]
+    fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        let roots = unsafe { _mm_sqrt_pd(a.into()) };
+        roots.simd_into(self)
+    }
+    /// Reciprocal of each lane, computed as `1.0 / a` with the vector type's ordinary division
+    /// operator rather than a dedicated reciprocal intrinsic.
+    #[inline(always)]
+    fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        1.0 / a
+    }
+    /// Adds corresponding lanes of `a` and `b` with `_mm_add_pd`.
+    #[inline(always)]
+    fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let sum = unsafe { _mm_add_pd(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Subtracts the lanes of `b` from the lanes of `a` with `_mm_sub_pd`.
+    #[inline(always)]
+    fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let difference = unsafe { _mm_sub_pd(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Multiplies corresponding lanes of `a` and `b` with `_mm_mul_pd`.
+    #[inline(always)]
+    fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let product = unsafe { _mm_mul_pd(a.into(), b.into()) };
+        product.simd_into(self)
+    }
+    /// Divides the lanes of `a` by the lanes of `b` with `_mm_div_pd`.
+    #[inline(always)]
+    fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let quotient = unsafe { _mm_div_pd(a.into(), b.into()) };
+        quotient.simd_into(self)
+    }
+    /// Per lane, combines the magnitude bits of `a` with the sign bit of `b`.
+    #[inline(always)]
+    fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let combined = unsafe {
+            // `-0.0` has only the sign bit set, so it serves as the sign-bit selector.
+            let sign_bit = _mm_set1_pd(-0.0);
+            let sign = _mm_and_pd(sign_bit, b.into());
+            let magnitude = _mm_andnot_pd(sign_bit, a.into());
+            _mm_or_pd(sign, magnitude)
+        };
+        combined.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as a bitmask.
+    #[inline(always)]
+    fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        // Predicate 0 is `_CMP_EQ_OQ` (ordered, quiet equality).
+        let val = unsafe { _mm_cmp_pd_mask::<0i32>(a.into(), b.into()) };
+        mask64x2 { val, simd: self }
+    }
+    /// Lane-wise `a < b`, returned as a bitmask.
+    #[inline(always)]
+    fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        // Predicate 17 is `_CMP_LT_OQ` (ordered, quiet less-than).
+        let val = unsafe { _mm_cmp_pd_mask::<17i32>(a.into(), b.into()) };
+        mask64x2 { val, simd: self }
+    }
+    /// Lane-wise `a <= b`, returned as a bitmask.
+    #[inline(always)]
+    fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        // Predicate 18 is `_CMP_LE_OQ` (ordered, quiet less-or-equal).
+        let val = unsafe { _mm_cmp_pd_mask::<18i32>(a.into(), b.into()) };
+        mask64x2 { val, simd: self }
+    }
+    /// Lane-wise `a >= b`, returned as a bitmask.
+    #[inline(always)]
+    fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        // Predicate 29 is `_CMP_GE_OQ` (ordered, quiet greater-or-equal).
+        let val = unsafe { _mm_cmp_pd_mask::<29i32>(a.into(), b.into()) };
+        mask64x2 { val, simd: self }
+    }
+    /// Lane-wise `a > b`, returned as a bitmask.
+    #[inline(always)]
+    fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
+        // Predicate 30 is `_CMP_GT_OQ` (ordered, quiet greater-than).
+        let val = unsafe { _mm_cmp_pd_mask::<30i32>(a.into(), b.into()) };
+        mask64x2 { val, simd: self }
+    }
+    /// Pairs the low lanes of `a` and `b` with `_mm_unpacklo_pd`: `[a0, b0]`.
+    #[inline(always)]
+    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let zipped = unsafe { _mm_unpacklo_pd(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Pairs the high lanes of `a` and `b` with `_mm_unpackhi_pd`: `[a1, b1]`.
+    #[inline(always)]
+    fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let zipped = unsafe { _mm_unpackhi_pd(a.into(), b.into()) };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes: `[a0, b0]`.
+    #[inline(always)]
+    fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        // Both selector bits clear: lane 0 of `a`, then lane 0 of `b`.
+        const LOW_OF_EACH: i32 = 0b00;
+        let evens = unsafe { _mm_shuffle_pd::<LOW_OF_EACH>(a.into(), b.into()) };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes: `[a1, b1]`.
+    #[inline(always)]
+    fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        // Both selector bits set: lane 1 of `a`, then lane 1 of `b`.
+        const HIGH_OF_EACH: i32 = 0b11;
+        let odds = unsafe { _mm_shuffle_pd::<HIGH_OF_EACH>(a.into(), b.into()) };
+        odds.simd_into(self)
+    }
+    /// Returns `(zip_low_f64x2(a, b), zip_high_f64x2(a, b))`.
+    #[inline(always)]
+    fn interleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        let low = self.zip_low_f64x2(a, b);
+        let high = self.zip_high_f64x2(a, b);
+        (low, high)
+    }
+    /// Returns `(unzip_low_f64x2(a, b), unzip_high_f64x2(a, b))`.
+    #[inline(always)]
+    fn deinterleave_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        let evens = self.unzip_low_f64x2(a, b);
+        let odds = self.unzip_high_f64x2(a, b);
+        (evens, odds)
+    }
+    /// Lane-wise maximum as computed by `_mm_max_pd`.
+    #[inline(always)]
+    fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let larger = unsafe { _mm_max_pd(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Lane-wise minimum as computed by `_mm_min_pd`.
+    #[inline(always)]
+    fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let smaller = unsafe { _mm_min_pd(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise maximum that falls back to `a` in every lane where `b` is NaN.
+    #[inline(always)]
+    fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let (lhs, rhs): (__m128d, __m128d) = (a.into(), b.into());
+        let picked = unsafe {
+            let plain_max = _mm_max_pd(lhs, rhs);
+            // Predicate 3 is `_CMP_UNORD_Q`; comparing `b` with itself flags its NaN lanes.
+            let b_is_nan = _mm_cmp_pd_mask::<3i32>(rhs, rhs);
+            _mm_mask_blend_pd(b_is_nan, plain_max, lhs)
+        };
+        picked.simd_into(self)
+    }
+    /// Lane-wise minimum that falls back to `a` in every lane where `b` is NaN.
+    #[inline(always)]
+    fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        let (lhs, rhs): (__m128d, __m128d) = (a.into(), b.into());
+        let picked = unsafe {
+            let plain_min = _mm_min_pd(lhs, rhs);
+            // Predicate 3 is `_CMP_UNORD_Q`; comparing `b` with itself flags its NaN lanes.
+            let b_is_nan = _mm_cmp_pd_mask::<3i32>(rhs, rhs);
+            _mm_mask_blend_pd(b_is_nan, plain_min, lhs)
+        };
+        picked.simd_into(self)
+    }
+    /// Fused `a * b + c` per lane via `_mm_fmadd_pd`.
+    #[inline(always)]
+    fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
+        let fused = unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()) };
+        fused.simd_into(self)
+    }
+    /// Fused `a * b - c` per lane via `_mm_fmsub_pd`.
+    #[inline(always)]
+    fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
+        let fused = unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()) };
+        fused.simd_into(self)
+    }
+    /// Rounds each lane toward negative infinity.
+    #[inline(always)]
+    fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
+        let rounded = unsafe { _mm_round_pd::<MODE>(a.into()) };
+        rounded.simd_into(self)
+    }
+    /// Rounds each lane toward positive infinity.
+    #[inline(always)]
+    fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        const MODE: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
+        let rounded = unsafe { _mm_round_pd::<MODE>(a.into()) };
+        rounded.simd_into(self)
+    }
+    /// Rounds each lane to the nearest integer using the `_MM_FROUND_TO_NEAREST_INT` mode.
+    #[inline(always)]
+    fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+        let rounded = unsafe { _mm_round_pd::<MODE>(a.into()) };
+        rounded.simd_into(self)
+    }
+    /// Fractional part of each lane: `a` minus its truncation toward zero.
+    #[inline(always)]
+    fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        let whole = self.trunc_f64x2(a);
+        a - whole
+    }
+    /// Rounds each lane toward zero.
+    #[inline(always)]
+    fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
+        const MODE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
+        let rounded = unsafe { _mm_round_pd::<MODE>(a.into()) };
+        rounded.simd_into(self)
+    }
+    /// Per lane, yields `b` where mask `a` is set and `c` where it is clear.
+    #[inline(always)]
+    fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
+        // `_mm_mask_blend_pd` takes its second vector operand for set mask bits, so the
+        // "true" value `b` goes last.
+        let chosen = unsafe { _mm_mask_blend_pd(a.val, c.into(), b.into()) };
+        chosen.simd_into(self)
+    }
+    /// Joins two 128-bit vectors into one 256-bit vector, `a` in the low half and `b` in the
+    /// high half (`_mm256_setr_m128d`).
+    #[inline(always)]
+    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
+        let joined = unsafe { _mm256_setr_m128d(a.into(), b.into()) };
+        joined.simd_into(self)
+    }
+    /// Views the same 128 bits as an `f32x4` using the cast intrinsic `_mm_castpd_ps`.
+    #[inline(always)]
+    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
+        let recast = unsafe { _mm_castpd_ps(a.into()) };
+        recast.simd_into(self)
+    }
+    /// Builds a mask with both lane bits set (`val == true`) or both clear.
+    #[inline(always)]
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+        let bits: u64 = if val { 3u64 } else { 0 };
+        mask64x2 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Packs an array of lane values into a bitmask: bit `i` is set iff `val[i] != 0`.
+    #[inline(always)]
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        let bits = val
+            .iter()
+            .enumerate()
+            .filter(|&(_, &lane)| lane != 0)
+            .fold(0u64, |acc, (i, _)| acc | (1u64 << i));
+        mask64x2 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Expands the bitmask into lane values: all-ones (`!0`) for a set bit, `0` otherwise.
+    #[inline(always)]
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        let bits = u64::from(a.val);
+        core::array::from_fn(|lane| match (bits >> lane) & 1 {
+            0 => 0,
+            _ => !0,
+        })
+    }
+    /// Builds a mask from the low two bits of `bits`; higher bits are discarded.
+    #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        let lanes = bits & 3u64;
+        mask64x2 {
+            val: lanes as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as an integer with one bit per lane (only the low two bits can be set).
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        let raw = u64::from(a.val);
+        raw & 3u64
+    }
+    /// Lane-wise AND of two masks, limited to the two valid lane bits.
+    #[inline(always)]
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask64x2 {
+            val: (lhs & rhs & 3u64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise OR of two masks, limited to the two valid lane bits.
+    #[inline(always)]
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask64x2 {
+            val: ((lhs | rhs) & 3u64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise XOR of two masks, limited to the two valid lane bits.
+    #[inline(always)]
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        let lhs = u64::from(a.val);
+        let rhs = u64::from(b.val);
+        mask64x2 {
+            val: ((lhs ^ rhs) & 3u64) as _,
+            simd: self,
+        }
+    }
+    /// Inverts every lane of the mask; bits above the two lane bits stay clear.
+    #[inline(always)]
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        let inverted = !u64::from(a.val);
+        mask64x2 {
+            val: (inverted & 3u64) as _,
+            simd: self,
+        }
+    }
+    /// Per lane, takes the bit of `b` where `a` is set and the bit of `c` where `a` is clear.
+    #[inline(always)]
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
+        let chooser = u64::from(a.val);
+        let from_b = chooser & u64::from(b.val);
+        let from_c = !chooser & u64::from(c.val);
+        mask64x2 {
+            val: ((from_b | from_c) & 3u64) as _,
+            simd: self,
+        }
+    }
+    /// Lane-wise equality of two masks: a result bit is set where `a` and `b` agree.
+    #[inline(always)]
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        // XOR marks the lanes that differ; inverting it marks the lanes that match.
+        let differing = u64::from(a.val ^ b.val);
+        mask64x2 {
+            val: (!differing & 3u64) as _,
+            simd: self,
+        }
+    }
+    /// `true` if at least one of the two lane bits is set.
+    #[inline(always)]
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        (u64::from(a.val) & 3u64) != 0
+    }
+    /// `true` if both lane bits are set.
+    #[inline(always)]
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        (u64::from(a.val) & 3u64) == 3u64
+    }
+    /// `true` if at least one of the two lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        (u64::from(a.val) & 3u64) != 3u64
+    }
+    /// `true` if neither lane bit is set.
+    #[inline(always)]
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        (u64::from(a.val) & 3u64) == 0
+    }
+    /// Concatenates two 2-lane masks into a 4-lane mask: `a` supplies bits 0..=1 and `b`
+    /// supplies bits 2..=3.
+    ///
+    /// Each half is reduced to its two lane bits before merging, matching the defensive
+    /// masking of the other mask operations; a stray high bit in `a.val` therefore cannot
+    /// leak into the lanes that belong to `b`.
+    #[inline(always)]
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+        let low = u64::from(a.val) & 3u64;
+        let high = (u64::from(b.val) & 3u64) << 2usize;
+        mask64x4 {
+            val: (low | high) as _,
+            simd: self,
+        }
+    }
+    /// Broadcasts `val` to all eight lanes with `_mm256_set1_ps`.
+    #[inline(always)]
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+        let filled = unsafe { _mm256_set1_ps(val) };
+        filled.simd_into(self)
+    }
+    /// Builds an `f32x8` from an array of eight lanes via `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        let val = unsafe { crate::support::checked_transmute_copy(&val) };
+        f32x8 { val, simd: self }
+    }
+    /// Builds an `f32x8` from a borrowed array by copying its bits via
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(val) };
+        f32x8 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the eight lanes as a plain array by reinterpreting the inner `__m256`.
+    #[inline(always)]
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        let raw: __m256 = a.val.0;
+        unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(raw) }
+    }
+    /// Borrows the lanes as an array by reinterpreting a reference to the inner `__m256`.
+    #[inline(always)]
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        let raw: &__m256 = &a.val.0;
+        unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(raw) }
+    }
+    /// Mutably borrows the lanes as an array by reinterpreting a reference to the inner
+    /// `__m256`.
+    #[inline(always)]
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        let raw: &mut __m256 = &mut a.val.0;
+        unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(raw) }
+    }
+    /// Copies the eight lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+        // View the inner vector as a pointer to its first `f32` lane.
+        let src = (&raw const a.val.0) as *const f32;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 8usize);
+        }
+    }
+    /// Reinterprets the storage of a `u8x32` as an `f32x8` without changing any bits.
+    #[inline(always)]
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        let bits = unsafe { core::mem::transmute(a.val) };
+        f32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Reinterprets the storage of an `f32x8` as a `u8x32` without changing any bits.
+    #[inline(always)]
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let bits = unsafe { core::mem::transmute(a.val) };
+        u8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Picks eight `f32` lanes out of `a` and `b` with a two-source byte permute.
+    ///
+    /// For `SHIFT >= 8` the result is `b` unchanged. Otherwise output byte `i` is read at
+    /// byte index `i + SHIFT * 4` of the permute's two sources, with `a` passed as the
+    /// first source and `b` as the second.
+    #[inline(always)]
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe {
+            if SHIFT >= 8usize {
+                return b;
+            }
+            // Identity byte indices 0..=31, each offset by the shift expressed in bytes
+            // (4 bytes per `f32` lane).
+            let idx = _mm256_add_epi8(
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ),
+                _mm256_set1_epi8((SHIFT * 4usize) as i8),
+            );
+            // The permute works on bytes, so both inputs are viewed as `u8x32` first.
+            let result = _mm256_permutex2var_epi8(
+                self.cvt_to_bytes_f32x8(a).val.0,
+                idx,
+                self.cvt_to_bytes_f32x8(b).val.0,
+            );
+            self.cvt_from_bytes_f32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    /// Applies `slide_within_blocks_f32x4::<SHIFT>` independently to the low and high halves
+    /// of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+        self,
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        let (a_lo, a_hi) = self.split_f32x8(a);
+        let (b_lo, b_hi) = self.split_f32x8(b);
+        let lo = self.slide_within_blocks_f32x4::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_f32x4::<SHIFT>(a_hi, b_hi);
+        self.combine_f32x4(lo, hi)
+    }
+    /// Lane-wise absolute value, computed by clearing the sign bit of every lane.
+    #[inline(always)]
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let value: __m256 = a.into();
+        unsafe {
+            // `-0.0` has only the sign bit set; and-not keeps every other bit of `value`.
+            let sign_bit = _mm256_set1_ps(-0.0);
+            _mm256_andnot_ps(sign_bit, value).simd_into(self)
+        }
+    }
+    /// Lane-wise negation, computed by flipping the sign bit of every lane.
+    #[inline(always)]
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let value: __m256 = a.into();
+        unsafe {
+            // `-0.0` has only the sign bit set, so XOR toggles just that bit.
+            let sign_bit = _mm256_set1_ps(-0.0);
+            _mm256_xor_ps(value, sign_bit).simd_into(self)
+        }
+    }
+    /// Lane-wise square root via `_mm256_sqrt_ps`.
+    #[inline(always)]
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let root = unsafe { _mm256_sqrt_ps(a.into()) };
+        root.simd_into(self)
+    }
+    /// Lane-wise approximate reciprocal via `_mm256_rcp_ps`.
+    #[inline(always)]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let recip = unsafe { _mm256_rcp_ps(a.into()) };
+        recip.simd_into(self)
+    }
+    /// Lane-wise `a + b` via `_mm256_add_ps`.
+    #[inline(always)]
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let sum = unsafe { _mm256_add_ps(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise `a - b` via `_mm256_sub_ps`.
+    #[inline(always)]
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let difference = unsafe { _mm256_sub_ps(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise `a * b` via `_mm256_mul_ps`.
+    #[inline(always)]
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let product = unsafe { _mm256_mul_ps(a.into(), b.into()) };
+        product.simd_into(self)
+    }
+    /// Lane-wise `a / b` via `_mm256_div_ps`.
+    #[inline(always)]
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let quotient = unsafe { _mm256_div_ps(a.into(), b.into()) };
+        quotient.simd_into(self)
+    }
+    /// Lane-wise copysign: all non-sign bits come from `a`, the sign bit comes from `b`.
+    #[inline(always)]
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let magnitude_src: __m256 = a.into();
+        let sign_src: __m256 = b.into();
+        unsafe {
+            // `-0.0` has only the sign bit set.
+            let sign_bit = _mm256_set1_ps(-0.0);
+            let sign = _mm256_and_ps(sign_bit, sign_src);
+            let magnitude = _mm256_andnot_ps(sign_bit, magnitude_src);
+            _mm256_or_ps(sign, magnitude).simd_into(self)
+        }
+    }
+    /// Lane-wise `a == b`, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        // Predicate immediate 0 is `_CMP_EQ_OQ` in Intel's encoding.
+        let bits = unsafe { _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a < b`, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        // Predicate immediate 17 is `_CMP_LT_OQ` in Intel's encoding.
+        let bits = unsafe { _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a <= b`, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        // Predicate immediate 18 is `_CMP_LE_OQ` in Intel's encoding.
+        let bits = unsafe { _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a >= b`, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        // Predicate immediate 29 is `_CMP_GE_OQ` in Intel's encoding.
+        let bits = unsafe { _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a > b`, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        // Predicate immediate 30 is `_CMP_GT_OQ` in Intel's encoding.
+        let bits = unsafe { _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()) };
+        mask32x8 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low four lanes of `a` and `b`: `a0, b0, a1, b1, a2, b2, a3, b3`.
+    #[inline(always)]
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe {
+            // Indices 0..=7 address `a`; indices 8..=15 address `b`.
+            let idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11);
+            _mm256_permutex2var_ps(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Interleaves the high four lanes of `a` and `b`: `a4, b4, a5, b5, a6, b6, a7, b7`.
+    #[inline(always)]
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe {
+            // Indices 0..=7 address `a`; indices 8..=15 address `b`.
+            let idx = _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15);
+            _mm256_permutex2var_ps(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of the concatenation of `a` and `b`.
+    #[inline(always)]
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe {
+            // Indices 0..=7 address `a`; indices 8..=15 address `b`.
+            let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+            _mm256_permutex2var_ps(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of the concatenation of `a` and `b`.
+    #[inline(always)]
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        unsafe {
+            // Indices 0..=7 address `a`; indices 8..=15 address `b`.
+            let idx = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
+            _mm256_permutex2var_ps(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Returns both interleavings of `a` and `b`: first the zipped low halves, then the
+    /// zipped high halves.
+    #[inline(always)]
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let lhs: __m256 = a.into();
+        let rhs: __m256 = b.into();
+        unsafe {
+            let low_idx = _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11);
+            let high_idx = _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15);
+            let low = _mm256_permutex2var_ps(lhs, low_idx, rhs);
+            let high = _mm256_permutex2var_ps(lhs, high_idx, rhs);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    /// Splits the concatenation of `a` and `b` into its even-indexed lanes (first result)
+    /// and its odd-indexed lanes (second result).
+    #[inline(always)]
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let lhs: __m256 = a.into();
+        let rhs: __m256 = b.into();
+        unsafe {
+            let even_idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+            let odd_idx = _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15);
+            let even = _mm256_permutex2var_ps(lhs, even_idx, rhs);
+            let odd = _mm256_permutex2var_ps(lhs, odd_idx, rhs);
+            (even.simd_into(self), odd.simd_into(self))
+        }
+    }
+    /// Lane-wise maximum via `_mm256_max_ps`.
+    #[inline(always)]
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let larger = unsafe { _mm256_max_ps(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Lane-wise minimum via `_mm256_min_ps`.
+    #[inline(always)]
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let smaller = unsafe { _mm256_min_ps(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise maximum that returns the lane of `a` wherever the lane of `b` is NaN.
+    #[inline(always)]
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let lhs: __m256 = a.into();
+        let rhs: __m256 = b.into();
+        unsafe {
+            let candidate = _mm256_max_ps(lhs, rhs);
+            // Predicate immediate 3 is `_CMP_UNORD_Q`; comparing `rhs` with itself flags
+            // exactly its NaN lanes.
+            let rhs_is_nan = _mm256_cmp_ps_mask::<3i32>(rhs, rhs);
+            _mm256_mask_blend_ps(rhs_is_nan, candidate, lhs).simd_into(self)
+        }
+    }
+    /// Lane-wise minimum that returns the lane of `a` wherever the lane of `b` is NaN.
+    #[inline(always)]
+    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let lhs: __m256 = a.into();
+        let rhs: __m256 = b.into();
+        unsafe {
+            let candidate = _mm256_min_ps(lhs, rhs);
+            // Predicate immediate 3 is `_CMP_UNORD_Q`; comparing `rhs` with itself flags
+            // exactly its NaN lanes.
+            let rhs_is_nan = _mm256_cmp_ps_mask::<3i32>(rhs, rhs);
+            _mm256_mask_blend_ps(rhs_is_nan, candidate, lhs).simd_into(self)
+        }
+    }
+    /// Fused multiply-add, `a * b + c` per lane, via `_mm256_fmadd_ps`.
+    #[inline(always)]
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let fused = unsafe { _mm256_fmadd_ps(a.into(), b.into(), c.into()) };
+        fused.simd_into(self)
+    }
+    /// Fused multiply-subtract, `a * b - c` per lane, via `_mm256_fmsub_ps`.
+    #[inline(always)]
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let fused = unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()) };
+        fused.simd_into(self)
+    }
+    /// Rounds every lane toward negative infinity, with exceptions suppressed.
+    #[inline(always)]
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let rounded = unsafe {
+            _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+        };
+        rounded.simd_into(self)
+    }
+    /// Rounds every lane toward positive infinity, with exceptions suppressed.
+    #[inline(always)]
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let rounded = unsafe {
+            _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+        };
+        rounded.simd_into(self)
+    }
+    /// Rounds every lane to the nearest integer using the `_MM_FROUND_TO_NEAREST_INT`
+    /// mode, with exceptions suppressed.
+    #[inline(always)]
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let rounded = unsafe {
+            _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+        };
+        rounded.simd_into(self)
+    }
+    /// Fractional part of every lane: the value minus its truncation toward zero.
+    #[inline(always)]
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let whole = self.trunc_f32x8(a);
+        a - whole
+    }
+    /// Rounds every lane toward zero, with exceptions suppressed.
+    #[inline(always)]
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let rounded =
+            unsafe { _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) };
+        rounded.simd_into(self)
+    }
+    /// Per-lane select: lanes whose mask bit in `a` is set come from `b`, the rest from `c`.
+    #[inline(always)]
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let if_set: __m256 = b.into();
+        let if_clear: __m256 = c.into();
+        // The blend intrinsic takes the "mask clear" operand first.
+        let blended = unsafe { _mm256_mask_blend_ps(a.val, if_clear, if_set) };
+        blended.simd_into(self)
+    }
+    /// Concatenates `a` (low 256 bits) and `b` (high 256 bits) into an `f32x16`.
+    #[inline(always)]
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+        unsafe {
+            // Widen `a` to 512 bits, then overwrite the upper half with `b`.
+            let widened = _mm512_castps256_ps512(a.into());
+            _mm512_insertf32x8::<1>(widened, b.into()).simd_into(self)
+        }
+    }
+    /// Splits an `f32x8` into its low and high 128-bit halves.
+    #[inline(always)]
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        let whole: __m256 = a.into();
+        unsafe {
+            let low = _mm256_extractf128_ps::<0>(whole);
+            let high = _mm256_extractf128_ps::<1>(whole);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    /// Reinterprets the 256 bits of an `f32x8` as an `f64x4` via `_mm256_castps_pd`.
+    #[inline(always)]
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+        let bits = unsafe { _mm256_castps_pd(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Reinterprets the 256 bits of an `f32x8` as an `i32x8` via `_mm256_castps_si256`.
+    #[inline(always)]
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let bits = unsafe { _mm256_castps_si256(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Reinterprets the 256 bits of an `f32x8` as a `u8x32` via `_mm256_castps_si256`.
+    #[inline(always)]
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let bits = unsafe { _mm256_castps_si256(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Reinterprets the 256 bits of an `f32x8` as a `u32x8` via `_mm256_castps_si256`.
+    #[inline(always)]
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let bits = unsafe { _mm256_castps_si256(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Converts every `f32` lane to `u32` by truncation, built on the signed conversion
+    /// `_mm256_cvttps_epi32`.
+    ///
+    /// Lanes that compare below 2^31 use the signed result directly. If any lane does not
+    /// (the compare is ordered, so NaN lanes count as "not below"), a correction is
+    /// computed for those lanes and added on top.
+    #[inline(always)]
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            let mut converted = _mm256_cvttps_epi32(a.into());
+            // Predicate 17 is `_CMP_LT_OQ`: all-ones in lanes where `a < 2^31`.
+            let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0));
+            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
+            if !all_in_range {
+                // Convert `a - 2^31` for the out-of-range lanes only (in-range lanes are
+                // zeroed by the and-not) and add it to the signed conversion result.
+                let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0));
+                let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
+                converted = _mm256_add_epi32(converted, excess_converted);
+            }
+            converted.simd_into(self)
+        }
+    }
+    /// Converts every `f32` lane to `u32` by truncation with clamping at both ends.
+    ///
+    /// The input is first passed through `max(a, 0.0)`. When some lane is not below 2^31,
+    /// those lanes get the same `a - 2^31` correction as the non-precise conversion, and
+    /// lanes greater than 4294967040.0 are replaced by `u32::MAX`.
+    #[inline(always)]
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            // Clamp from below at zero before converting.
+            let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
+            let mut converted = _mm256_cvttps_epi32(a);
+            // Predicate 17 is `_CMP_LT_OQ`: all-ones in lanes where `a < 2^31`.
+            let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
+            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
+            if !all_in_range {
+                // Lanes where `4294967040.0 < a`.
+                let exceeds_unsigned_range =
+                    _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a));
+                // Convert `a - 2^31` for the out-of-range lanes only and add it on top.
+                let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0));
+                let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
+                converted = _mm256_add_epi32(converted, excess_converted);
+                // Saturate the lanes flagged above to `u32::MAX`.
+                converted = _mm256_blendv_epi8(
+                    converted,
+                    _mm256_set1_epi32(u32::MAX.cast_signed()),
+                    exceeds_unsigned_range,
+                );
+            }
+            converted.simd_into(self)
+        }
+    }
+    /// Converts every `f32` lane to `i32` by truncation via `_mm256_cvttps_epi32`.
+    #[inline(always)]
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let truncated = unsafe { _mm256_cvttps_epi32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts every `f32` lane to `i32` by truncation, with extra handling for lanes
+    /// that do not compare below 2^31.
+    ///
+    /// When any lane fails the `a < 2^31` test, those lanes are set to `i32::MAX`, and
+    /// lanes holding NaN are then cleared to zero.
+    #[inline(always)]
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            let a = a.into();
+            let mut converted = _mm256_cvttps_epi32(a);
+            // Predicate 17 is `_CMP_LT_OQ`: all-ones in lanes where `a < 2^31`.
+            let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
+            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
+            if !all_in_range {
+                // Keep the converted value for in-range lanes, `i32::MAX` elsewhere.
+                converted = _mm256_blendv_epi8(
+                    _mm256_set1_epi32(i32::MAX),
+                    converted,
+                    _mm256_castps_si256(in_range),
+                );
+                // Predicate 7 is `_CMP_ORD_Q`: all-ones where the lane is not NaN, so the
+                // AND zeroes NaN lanes.
+                let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a));
+                converted = _mm256_and_si256(converted, is_not_nan);
+            }
+            converted.simd_into(self)
+        }
+    }
+    /// Broadcasts `val` into every lane of an `i8x32` using `_mm256_set1_epi8`.
+    #[inline(always)]
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+        let broadcast = unsafe { _mm256_set1_epi8(val) };
+        broadcast.simd_into(self)
+    }
+    /// Builds an `i8x32` from an owned array by copying its bits via
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(&val) };
+        i8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds an `i8x32` from a borrowed array by copying its bits via
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(val) };
+        i8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the 32 lanes as a plain array by reinterpreting the inner `__m256i`.
+    #[inline(always)]
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
+        let raw: __m256i = a.val.0;
+        unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(raw) }
+    }
+    /// Borrows the lanes as an array by reinterpreting a reference to the inner `__m256i`.
+    #[inline(always)]
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
+        let raw: &__m256i = &a.val.0;
+        unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(raw) }
+    }
+    /// Mutably borrows the lanes as an array by reinterpreting a reference to the inner
+    /// `__m256i`.
+    #[inline(always)]
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(raw) }
+    }
+    /// Copies the 32 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+        // View the inner vector as a pointer to its first `i8` lane.
+        let src = (&raw const a.val.0) as *const i8;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 32usize);
+        }
+    }
+    /// Reinterprets the storage of a `u8x32` as an `i8x32` without changing any bits.
+    #[inline(always)]
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        let bits = unsafe { core::mem::transmute(a.val) };
+        i8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Reinterprets the storage of an `i8x32` as a `u8x32` without changing any bits.
+    #[inline(always)]
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let bits = unsafe { core::mem::transmute(a.val) };
+        u8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Picks 32 `i8` lanes out of `a` and `b` with a two-source byte permute.
+    ///
+    /// For `SHIFT >= 32` the result is `b` unchanged. Otherwise output byte `i` is read at
+    /// byte index `i + SHIFT` of the permute's two sources, with `a` passed as the first
+    /// source and `b` as the second.
+    #[inline(always)]
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            if SHIFT >= 32usize {
+                return b;
+            }
+            // Identity byte indices 0..=31, each offset by the shift (one byte per lane).
+            let idx = _mm256_add_epi8(
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ),
+                _mm256_set1_epi8((SHIFT) as i8),
+            );
+            // The permute works on bytes, so both inputs are viewed as `u8x32` first.
+            let result = _mm256_permutex2var_epi8(
+                self.cvt_to_bytes_i8x32(a).val.0,
+                idx,
+                self.cvt_to_bytes_i8x32(b).val.0,
+            );
+            self.cvt_from_bytes_i8x32(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    /// Applies `slide_within_blocks_i8x16::<SHIFT>` independently to the low and high halves
+    /// of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+        self,
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
+        let (a_lo, a_hi) = self.split_i8x32(a);
+        let (b_lo, b_hi) = self.split_i8x32(b);
+        let lo = self.slide_within_blocks_i8x16::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_i8x16::<SHIFT>(a_hi, b_hi);
+        self.combine_i8x16(lo, hi)
+    }
+    /// Lane-wise `a + b` via `_mm256_add_epi8`.
+    #[inline(always)]
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let sum = unsafe { _mm256_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise `a - b` via `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let difference = unsafe { _mm256_sub_epi8(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise 8-bit multiply keeping the low 8 bits of each product.
+    ///
+    /// The work is done with 16-bit multiplies: one for the even-indexed bytes and one for
+    /// the odd-indexed bytes, whose low result bytes are then merged.
+    #[inline(always)]
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        unsafe {
+            // The low byte of each 16-bit product is the product of the even bytes.
+            let even = _mm256_mullo_epi16(lhs, rhs);
+            // Move the odd bytes into the low position before multiplying.
+            let lhs_odd = _mm256_srli_epi16::<8>(lhs);
+            let rhs_odd = _mm256_srli_epi16::<8>(rhs);
+            let odd = _mm256_mullo_epi16(lhs_odd, rhs_odd);
+            let high_bytes = _mm256_slli_epi16::<8>(odd);
+            let low_bytes = _mm256_and_si256(even, _mm256_set1_epi16(0xFF));
+            _mm256_or_si256(high_bytes, low_bytes).simd_into(self)
+        }
+    }
+    /// Bitwise AND of `a` and `b` via `_mm256_and_si256`.
+    #[inline(always)]
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let bits = unsafe { _mm256_and_si256(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b` via `_mm256_or_si256`.
+    #[inline(always)]
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let bits = unsafe { _mm256_or_si256(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b` via `_mm256_xor_si256`.
+    #[inline(always)]
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let bits = unsafe { _mm256_xor_si256(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let all_ones: i8 = !0;
+        a ^ all_ones
+    }
+    /// Shifts every `i8` lane left by `shift` bits, discarding the bits shifted out of the
+    /// lane.
+    ///
+    /// x86 has no 8-bit shift, so the vector is shifted as 16-bit lanes and the bits that
+    /// crossed from each even byte into its odd neighbour are masked away. Widening to 16
+    /// bits and re-packing with `_mm256_packs_epi16` would saturate instead of truncating
+    /// (`64 << 1` would give `127` rather than `-128`). A `shift` of 8 or more yields
+    /// all-zero lanes.
+    #[inline(always)]
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        // Per-byte mask of the bit positions that can legitimately be set after the shift;
+        // `checked_shl` returns `None` once the shift reaches the lane width.
+        let keep: u8 = 0xFFu8.checked_shl(shift).unwrap_or(0);
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let shifted = _mm256_sll_epi16(val, shift_count);
+            _mm256_and_si256(shifted, _mm256_set1_epi8(keep.cast_signed())).simd_into(self)
+        }
+    }
+    /// Per-lane variable left shift, computed one scalar lane at a time: lane `i` of `a`
+    /// is shifted by lane `i` of `b`.
+    #[inline(always)]
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let lanes: [i8; 32usize] = core::array::from_fn(|i| a[i] << b[i]);
+        lanes.simd_into(self)
+    }
+    /// Arithmetic right shift of every `i8` lane by `shift` bits.
+    ///
+    /// Lanes are sign-extended to 16 bits, shifted with `_mm256_sra_epi16`, and packed
+    /// back to 8 bits.
+    #[inline(always)]
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        unsafe {
+            let val: __m256i = a.into();
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            // All-ones bytes for negative lanes; used as the high byte of the widened value.
+            let sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(), val);
+            let lo = _mm256_sra_epi16(_mm256_unpacklo_epi8(val, sign), count);
+            let hi = _mm256_sra_epi16(_mm256_unpackhi_epi8(val, sign), count);
+            _mm256_packs_epi16(lo, hi).simd_into(self)
+        }
+    }
+    /// Per-lane variable right shift, computed one scalar lane at a time: lane `i` of `a`
+    /// is shifted by lane `i` of `b`.
+    #[inline(always)]
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let lanes: [i8; 32usize] = core::array::from_fn(|i| a[i] >> b[i]);
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b` on signed bytes, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpeq_epi8_mask(a.into(), b.into()) };
+        mask8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a < b` on signed bytes, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmplt_epi8_mask(a.into(), b.into()) };
+        mask8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a <= b` on signed bytes, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmple_epi8_mask(a.into(), b.into()) };
+        mask8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a >= b` on signed bytes, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpge_epi8_mask(a.into(), b.into()) };
+        mask8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise `a > b` on signed bytes, returned as a compact bitmask.
+    #[inline(always)]
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let bits = unsafe { _mm256_cmpgt_epi8_mask(a.into(), b.into()) };
+        mask8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low 16 lanes of `a` and `b`: `a0, b0, a1, b1, ..., a15, b15`.
+    #[inline(always)]
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Indices 0..=31 address `a`; indices 32..=63 address `b`.
+            let idx = _mm256_setr_epi8(
+                0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11,
+                43, 12, 44, 13, 45, 14, 46, 15, 47,
+            );
+            _mm256_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Interleaves the high 16 lanes of `a` and `b`: `a16, b16, a17, b17, ..., a31, b31`.
+    #[inline(always)]
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Indices 0..=31 address `a`; indices 32..=63 address `b`.
+            let idx = _mm256_setr_epi8(
+                16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26,
+                58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+            );
+            _mm256_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes of the concatenation of `a` and `b`.
+    #[inline(always)]
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Indices 0..=31 address `a`; indices 32..=63 address `b`.
+            let idx = _mm256_setr_epi8(
+                0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42,
+                44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            );
+            _mm256_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes of the concatenation of `a` and `b`.
+    #[inline(always)]
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            // Indices 0..=31 address `a`; indices 32..=63 address `b`.
+            let idx = _mm256_setr_epi8(
+                1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43,
+                45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+            );
+            _mm256_permutex2var_epi8(a.into(), idx, b.into()).simd_into(self)
+        }
+    }
+    /// Returns both interleavings of `a` and `b`: first the zipped low halves, then the
+    /// zipped high halves.
+    #[inline(always)]
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        unsafe {
+            // Indices 0..=31 address `lhs`; indices 32..=63 address `rhs`.
+            let low_idx = _mm256_setr_epi8(
+                0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11,
+                43, 12, 44, 13, 45, 14, 46, 15, 47,
+            );
+            let high_idx = _mm256_setr_epi8(
+                16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26,
+                58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+            );
+            let low = _mm256_permutex2var_epi8(lhs, low_idx, rhs);
+            let high = _mm256_permutex2var_epi8(lhs, high_idx, rhs);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    /// Splits the concatenation of `a` and `b` into its even-indexed lanes (first result)
+    /// and its odd-indexed lanes (second result).
+    #[inline(always)]
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let lhs: __m256i = a.into();
+        let rhs: __m256i = b.into();
+        unsafe {
+            // Indices 0..=31 address `lhs`; indices 32..=63 address `rhs`.
+            let even_idx = _mm256_setr_epi8(
+                0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42,
+                44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            );
+            let odd_idx = _mm256_setr_epi8(
+                1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43,
+                45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+            );
+            let even = _mm256_permutex2var_epi8(lhs, even_idx, rhs);
+            let odd = _mm256_permutex2var_epi8(lhs, odd_idx, rhs);
+            (even.simd_into(self), odd.simd_into(self))
+        }
+    }
+    /// Per-lane select: lanes whose mask bit in `a` is set come from `b`, the rest from `c`.
+    #[inline(always)]
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
+        let if_set: __m256i = b.into();
+        let if_clear: __m256i = c.into();
+        // The blend intrinsic takes the "mask clear" operand first.
+        let blended = unsafe { _mm256_mask_blend_epi8(a.val, if_clear, if_set) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise signed minimum via `_mm256_min_epi8`.
+    #[inline(always)]
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let smaller = unsafe { _mm256_min_epi8(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise signed maximum via `_mm256_max_epi8`.
+    #[inline(always)]
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let larger = unsafe { _mm256_max_epi8(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Concatenates `a` (low 256 bits) and `b` (high 256 bits) into an `i8x64`.
+    #[inline(always)]
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        unsafe {
+            // Widen `a` to 512 bits, then overwrite the upper half with `b`.
+            let widened = _mm512_castsi256_si512(a.into());
+            _mm512_inserti64x4::<1>(widened, b.into()).simd_into(self)
+        }
+    }
+    /// Splits an `i8x32` into its low and high 128-bit halves.
+    #[inline(always)]
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        let whole: __m256i = a.into();
+        unsafe {
+            let low = _mm256_extracti128_si256::<0>(whole);
+            let high = _mm256_extracti128_si256::<1>(whole);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    /// Lane-wise negation, computed as `0 - a` with `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        unsafe {
+            let zero = _mm256_setzero_si256();
+            _mm256_sub_epi8(zero, a.into()).simd_into(self)
+        }
+    }
+    /// Reinterprets an `i8x32` as a `u8x32` by passing its raw `__m256i` through unchanged.
+    #[inline(always)]
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets an `i8x32` as a `u32x8` by passing its raw `__m256i` through unchanged.
+    #[inline(always)]
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` into every lane of a `u8x32`.
+    #[inline(always)]
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        // The intrinsic takes `i8`; the cast keeps the bit pattern.
+        let byte: i8 = val.cast_signed();
+        let broadcast = unsafe { _mm256_set1_epi8(byte) };
+        broadcast.simd_into(self)
+    }
+    /// Builds a `u8x32` from an owned array by copying its bits via
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(&val) };
+        u8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Builds a `u8x32` from a borrowed array by copying its bits via
+    /// `crate::support::checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        let raw = unsafe { crate::support::checked_transmute_copy(val) };
+        u8x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the 32 lanes as a plain array by reinterpreting the inner `__m256i`.
+    #[inline(always)]
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        let raw: __m256i = a.val.0;
+        unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(raw) }
+    }
+    /// Borrows the lanes as an array by reinterpreting a reference to the inner `__m256i`.
+    #[inline(always)]
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        let raw: &__m256i = &a.val.0;
+        unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(raw) }
+    }
+    /// Mutably borrows the lanes as an array by reinterpreting a reference to the inner
+    /// `__m256i`.
+    #[inline(always)]
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(raw) }
+    }
+    /// Copies the 32 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        // View the inner vector as a pointer to its first `u8` lane.
+        let src = (&raw const a.val.0) as *const u8;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 32usize);
+        }
+    }
+    /// Byte-view conversion for `u8x32`; source and target have the same type, so the
+    /// storage is passed through a transmute unchanged.
+    #[inline(always)]
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let bits = unsafe { core::mem::transmute(a.val) };
+        u8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Byte-view conversion for `u8x32`; source and target have the same type, so the
+    /// storage is passed through a transmute unchanged.
+    #[inline(always)]
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let bits = unsafe { core::mem::transmute(a.val) };
+        u8x32 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Picks 32 `u8` lanes out of `a` and `b` with a two-source byte permute.
+    ///
+    /// For `SHIFT >= 32` the result is `b` unchanged. Otherwise output byte `i` is read at
+    /// byte index `i + SHIFT` of the permute's two sources, with `a` passed as the first
+    /// source and `b` as the second.
+    #[inline(always)]
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        unsafe {
+            if SHIFT >= 32usize {
+                return b;
+            }
+            // Identity byte indices 0..=31, each offset by the shift (one byte per lane).
+            let idx = _mm256_add_epi8(
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ),
+                _mm256_set1_epi8((SHIFT) as i8),
+            );
+            let result = _mm256_permutex2var_epi8(
+                self.cvt_to_bytes_u8x32(a).val.0,
+                idx,
+                self.cvt_to_bytes_u8x32(b).val.0,
+            );
+            self.cvt_from_bytes_u8x32(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    /// Applies `slide_within_blocks_u8x16::<SHIFT>` independently to the low and high halves
+    /// of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+        self,
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        let (a_lo, a_hi) = self.split_u8x32(a);
+        let (b_lo, b_hi) = self.split_u8x32(b);
+        let lo = self.slide_within_blocks_u8x16::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_u8x16::<SHIFT>(a_hi, b_hi);
+        self.combine_u8x16(lo, hi)
+    }
+    /// Lane-wise `a + b` via `_mm256_add_epi8`.
+    #[inline(always)]
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let sum = unsafe { _mm256_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Subtracts the byte lanes of `b` from those of `a` with `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let difference = unsafe { _mm256_sub_epi8(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Multiplies byte lanes, keeping the low 8 bits of each product.
+    ///
+    /// The work is done with 16-bit multiplies: one on the vectors as-is (its low byte is the
+    /// product of the even bytes) and one on both inputs shifted right by 8 (the odd bytes).
+    /// The odd products are moved back into the high byte and merged with the masked even ones.
+    #[inline(always)]
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let product = unsafe {
+            let even = _mm256_mullo_epi16(lhs, rhs);
+            let odd = _mm256_mullo_epi16(
+                _mm256_srli_epi16::<8>(lhs),
+                _mm256_srli_epi16::<8>(rhs),
+            );
+            let odd_high = _mm256_slli_epi16::<8>(odd);
+            let even_low = _mm256_and_si256(even, _mm256_set1_epi16(0xFF));
+            _mm256_or_si256(odd_high, even_low)
+        };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of `a` and `b` via `_mm256_and_si256`.
+    #[inline(always)]
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b` via `_mm256_or_si256`.
+    #[inline(always)]
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b` via `_mm256_xor_si256`.
+    #[inline(always)]
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with the all-ones scalar `!0`.
+    #[inline(always)]
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every byte lane of `a` left by `shift` bits, discarding bits shifted out of the
+    /// lane.
+    ///
+    /// The bytes are zero-extended to 16-bit lanes, shifted there with `_mm256_sll_epi16`, and
+    /// narrowed back to bytes.
+    #[inline(always)]
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
+            let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
+            // `_mm256_packus_epi16` narrows with unsigned saturation, so bits shifted past
+            // bit 7 must be cleared before packing; otherwise e.g. `0x81 << 1` would come out
+            // as `0xFF` instead of `0x02`.
+            let low_byte = _mm256_set1_epi16(0xFF);
+            let lo_shifted = _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), low_byte);
+            let hi_shifted = _mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), low_byte);
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b`, one scalar lane at a time.
+    #[inline(always)]
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let lanes = core::array::from_fn(|lane| a[lane] << b[lane]);
+        lanes.simd_into(self)
+    }
+    /// Logical right shift of every byte lane by `shift` bits.
+    ///
+    /// The bytes are zero-extended to 16-bit lanes, shifted with `_mm256_srl_epi16`, and packed
+    /// back to bytes.
+    #[inline(always)]
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let bytes: __m256i = a.into();
+        let narrowed = unsafe {
+            let zero = _mm256_setzero_si256();
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lo = _mm256_srl_epi16(_mm256_unpacklo_epi8(bytes, zero), count);
+            let hi = _mm256_srl_epi16(_mm256_unpackhi_epi8(bytes, zero), count);
+            _mm256_packus_epi16(lo, hi)
+        };
+        narrowed.simd_into(self)
+    }
+    /// Shifts each lane of `a` right by the matching lane of `b`, one scalar lane at a time.
+    #[inline(always)]
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let lanes = core::array::from_fn(|lane| a[lane] >> b[lane]);
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b` using `_mm256_cmpeq_epu8_mask`; the result is stored as a bitmask.
+    #[inline(always)]
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpeq_epu8_mask(lhs, rhs) };
+        mask8x32 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a < b` using `_mm256_cmplt_epu8_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmplt_epu8_mask(lhs, rhs) };
+        mask8x32 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a <= b` using `_mm256_cmple_epu8_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmple_epu8_mask(lhs, rhs) };
+        mask8x32 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a >= b` using `_mm256_cmpge_epu8_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpge_epu8_mask(lhs, rhs) };
+        mask8x32 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a > b` using `_mm256_cmpgt_epu8_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpgt_epu8_mask(lhs, rhs) };
+        mask8x32 { val, simd: self }
+    }
+    /// Interleaves the first 16 bytes of `a` and `b` (`a0, b0, a1, b1, ...`) with a single
+    /// two-source byte permute; indices 32 and above select from `b`.
+    #[inline(always)]
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let zipped = unsafe {
+            let idx = _mm256_setr_epi8(
+                0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42,
+                11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+            );
+            _mm256_permutex2var_epi8(lhs, idx, rhs)
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the last 16 bytes of `a` and `b` (`a16, b16, a17, b17, ...`) with a single
+    /// two-source byte permute; indices 32 and above select from `b`.
+    #[inline(always)]
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let zipped = unsafe {
+            let idx = _mm256_setr_epi8(
+                16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57,
+                26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+            );
+            _mm256_permutex2var_epi8(lhs, idx, rhs)
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed bytes of the concatenation `a, b` with a single two-source
+    /// byte permute.
+    #[inline(always)]
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let evens = unsafe {
+            let idx = _mm256_setr_epi8(
+                0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
+                42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+            );
+            _mm256_permutex2var_epi8(lhs, idx, rhs)
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed bytes of the concatenation `a, b` with a single two-source
+    /// byte permute.
+    #[inline(always)]
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let odds = unsafe {
+            let idx = _mm256_setr_epi8(
+                1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
+                43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+            );
+            _mm256_permutex2var_epi8(lhs, idx, rhs)
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both halves of the byte-wise interleave of `a` and `b`.
+    ///
+    /// The two permutes are exactly the ones performed by `zip_low_u8x32` and
+    /// `zip_high_u8x32`, so this delegates to them.
+    #[inline(always)]
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let low = self.zip_low_u8x32(a, b);
+        let high = self.zip_high_u8x32(a, b);
+        (low, high)
+    }
+    /// Splits the concatenation `a, b` into its even-indexed and odd-indexed bytes.
+    ///
+    /// The two permutes are exactly the ones performed by `unzip_low_u8x32` and
+    /// `unzip_high_u8x32`, so this delegates to them.
+    #[inline(always)]
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let evens = self.unzip_low_u8x32(a, b);
+        let odds = self.unzip_high_u8x32(a, b);
+        (evens, odds)
+    }
+    /// Picks lanes from `b` where the mask bit in `a` is set and from `c` where it is clear,
+    /// using `_mm256_mask_blend_epi8`.
+    #[inline(always)]
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+        let (if_set, if_clear): (__m256i, __m256i) = (b.into(), c.into());
+        let picked = unsafe { _mm256_mask_blend_epi8(a.val, if_clear, if_set) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise unsigned minimum of `a` and `b` via `_mm256_min_epu8`.
+    #[inline(always)]
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let smallest = unsafe { _mm256_min_epu8(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise unsigned maximum of `a` and `b` via `_mm256_max_epu8`.
+    #[inline(always)]
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let largest = unsafe { _mm256_max_epu8(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates `a` (lower 256 bits) and `b` (upper 256 bits) into one 512-bit vector.
+    #[inline(always)]
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        let (lower, upper): (__m256i, __m256i) = (a.into(), b.into());
+        let wide = unsafe { _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lower), upper) };
+        wide.simd_into(self)
+    }
+    /// Splits `a` into its lower and upper 128-bit halves with `_mm256_extracti128_si256`.
+    #[inline(always)]
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        let whole: __m256i = a.into();
+        let lower = unsafe { _mm256_extracti128_si256::<0>(whole) };
+        let upper = unsafe { _mm256_extracti128_si256::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Zero-extends each byte lane to a 16-bit lane with `_mm512_cvtepu8_epi16`.
+    #[inline(always)]
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+        let bytes: __m256i = a.into();
+        let widened = unsafe { _mm512_cvtepu8_epi16(bytes) };
+        widened.simd_into(self)
+    }
+    /// Reinterprets the register behind `a` as a `u32x8` without converting any lanes.
+    #[inline(always)]
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Builds a 32-lane mask with every lane set to `val` (all 32 bits set, or none).
+    #[inline(always)]
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+        let bits: u64 = if val { 0xFFFF_FFFF } else { 0 };
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Packs an array of lane values into a bitmask: bit `i` is set when `val[i]` is non-zero.
+    #[inline(always)]
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        let bits = val.iter().enumerate().fold(0u64, |acc, (i, &lane)| {
+            if lane != 0 { acc | (1u64 << i) } else { acc }
+        });
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Expands the bitmask into an array: lane `i` is `!0` (all bits set) when bit `i` is set
+    /// and `0` otherwise.
+    #[inline(always)]
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        let bits = u64::from(a.val);
+        let mut lanes = [0i8; 32usize];
+        for (i, lane) in lanes.iter_mut().enumerate() {
+            if (bits >> i) & 1 != 0 {
+                *lane = !0;
+            }
+        }
+        lanes
+    }
+    /// Builds a mask from the low 32 bits of `bits`; higher bits are ignored.
+    #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let low = bits & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: low as _,
+            simd: self,
+        }
+    }
+    /// Returns the mask as a `u64` whose low 32 bits hold one bit per lane.
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let bits = u64::from(a.val);
+        bits & 0xFFFF_FFFFu64
+    }
+    /// Bitwise AND of two 32-lane masks, computed on their widened `u64` bit patterns.
+    #[inline(always)]
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let bits = (lhs & rhs) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Bitwise OR of two 32-lane masks, computed on their widened `u64` bit patterns.
+    #[inline(always)]
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let bits = (lhs | rhs) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Bitwise XOR of two 32-lane masks, computed on their widened `u64` bit patterns.
+    #[inline(always)]
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let bits = (lhs ^ rhs) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Inverts all 32 lanes of the mask; bits above bit 31 are cleared again after the NOT.
+    #[inline(always)]
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        let inverted = !u64::from(a.val);
+        mask8x32 {
+            val: (inverted & 0xFFFF_FFFFu64) as _,
+            simd: self,
+        }
+    }
+    /// Per-lane select between masks: takes the bit from `b` where `a` is set and from `c`
+    /// where `a` is clear.
+    #[inline(always)]
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        let cond = u64::from(a.val);
+        let on_set = u64::from(b.val);
+        let on_clear = u64::from(c.val);
+        let bits = ((cond & on_set) | (!cond & on_clear)) & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    /// Per-lane equality of two masks: a result bit is set where the bits of `a` and `b` agree
+    /// (the complement of their XOR, limited to 32 lanes).
+    #[inline(always)]
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let differing = u64::from(a.val ^ b.val);
+        let agreeing = !differing & 0xFFFF_FFFFu64;
+        mask8x32 {
+            val: agreeing as _,
+            simd: self,
+        }
+    }
+    /// Returns `true` when at least one of the 32 lane bits is set.
+    #[inline(always)]
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFF_FFFFu64) != 0
+    }
+    /// Returns `true` when all 32 lane bits are set.
+    #[inline(always)]
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFF_FFFFu64) == 0xFFFF_FFFFu64
+    }
+    /// Returns `true` when at least one of the 32 lane bits is clear.
+    #[inline(always)]
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFF_FFFFu64) != 0xFFFF_FFFFu64
+    }
+    /// Returns `true` when none of the 32 lane bits is set.
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        (u64::from(a.val) & 0xFFFF_FFFFu64) == 0
+    }
+    /// Concatenates two 32-lane masks into a 64-lane mask: `a` fills bits 0..=31 and `b` fills
+    /// bits 32..=63.
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        let lower = u64::from(a.val);
+        let upper = u64::from(b.val) << 32usize;
+        mask8x64 {
+            val: lower | upper,
+            simd: self,
+        }
+    }
+    /// Splits the mask into two 16-lane masks: bits 0..=15 first, bits 16..=31 second.
+    #[inline(always)]
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+        let bits = u64::from(a.val);
+        let lower = mask8x16 {
+            val: (bits & 0xFFFFu64) as _,
+            simd: self,
+        };
+        let upper = mask8x16 {
+            val: ((bits >> 16usize) & 0xFFFFu64) as _,
+            simd: self,
+        };
+        (lower, upper)
+    }
+    /// Broadcasts `val` to all 16 lanes with `_mm256_set1_epi16`.
+    #[inline(always)]
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        let lanes = unsafe { _mm256_set1_epi16(val) };
+        lanes.simd_into(self)
+    }
+    /// Builds an `i16x16` from an array passed by value, copying its bytes into the vector
+    /// storage through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        let storage = unsafe { crate::support::checked_transmute_copy(&val) };
+        i16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Builds an `i16x16` from a borrowed array, copying its bytes into the vector storage
+    /// through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        let storage = unsafe { crate::support::checked_transmute_copy(val) };
+        i16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Returns the 16 lanes of `a` as a plain array.
+    #[inline(always)]
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        let raw: __m256i = a.val.0;
+        // SAFETY: both types are 32 bytes and every bit pattern is a valid `[i16; 16]`.
+        unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(raw) }
+    }
+    /// Borrows the lanes of `a` as a plain array without copying.
+    #[inline(always)]
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        let raw: &__m256i = &a.val.0;
+        // SAFETY: both pointees are 32 bytes and every bit pattern is a valid `[i16; 16]`.
+        unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(raw) }
+    }
+    /// Mutably borrows the lanes of `a` as a plain array without copying.
+    #[inline(always)]
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        // SAFETY: both pointees are 32 bytes and every bit pattern is valid for either type.
+        unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(raw) }
+    }
+    /// Copies the 16 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
+        let src = (&raw const a.val.0).cast::<i16>();
+        // SAFETY: `src` points at the 32-byte register held in the local `a`, `dest` is a
+        // distinct 16-element array, so both ranges are valid and cannot overlap.
+        unsafe { core::ptr::copy_nonoverlapping(src, dest.as_mut_ptr(), 16usize) }
+    }
+    /// Reinterprets the storage of a byte vector as an `i16x16` via transmute; no lane values
+    /// are converted.
+    #[inline(always)]
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        i16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets the storage of an `i16x16` as a byte vector via transmute; no lane values
+    /// are converted.
+    #[inline(always)]
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Returns the 16 lanes of the concatenation `a, b` that start at lane `SHIFT`.
+    ///
+    /// `SHIFT >= 16` returns `b` unchanged. Otherwise the slide is done on bytes: each byte
+    /// index is offset by `SHIFT * 2` and fed to a two-source byte permute, where indices
+    /// 0..=31 address `a` and 32..=63 address `b`.
+    #[inline(always)]
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i16x16(a).val.0;
+        let second = self.cvt_to_bytes_i16x16(b).val.0;
+        let shuffled = unsafe {
+            let byte_ids = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            let offset = _mm256_set1_epi8((SHIFT * 2usize) as i8);
+            _mm256_permutex2var_epi8(first, _mm256_add_epi8(byte_ids, offset), second)
+        };
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        })
+    }
+    /// Applies `slide_within_blocks_i16x8::<SHIFT>` to each 128-bit half of `a` and `b`
+    /// separately and joins the two results.
+    #[inline(always)]
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
+        self,
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
+        let (a_lo, a_hi) = self.split_i16x16(a);
+        let (b_lo, b_hi) = self.split_i16x16(b);
+        let lo = self.slide_within_blocks_i16x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_i16x8::<SHIFT>(a_hi, b_hi);
+        self.combine_i16x8(lo, hi)
+    }
+    /// Adds the 16-bit lanes of `a` and `b` with `_mm256_add_epi16`.
+    #[inline(always)]
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let sum = unsafe { _mm256_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Subtracts the 16-bit lanes of `b` from those of `a` with `_mm256_sub_epi16`.
+    #[inline(always)]
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let difference = unsafe { _mm256_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Multiplies 16-bit lanes and keeps the low 16 bits of each product (`_mm256_mullo_epi16`).
+    #[inline(always)]
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let product = unsafe { _mm256_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of `a` and `b` via `_mm256_and_si256`.
+    #[inline(always)]
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b` via `_mm256_or_si256`.
+    #[inline(always)]
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b` via `_mm256_xor_si256`.
+    #[inline(always)]
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with the all-ones scalar `!0`.
+    #[inline(always)]
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every 16-bit lane left by `shift` bits with `_mm256_sll_epi16`.
+    #[inline(always)]
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let lanes: __m256i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sll_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b`, one scalar lane at a time.
+    #[inline(always)]
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let lanes = core::array::from_fn(|lane| a[lane] << b[lane]);
+        lanes.simd_into(self)
+    }
+    /// Arithmetic (sign-preserving) right shift of every 16-bit lane by `shift` bits with
+    /// `_mm256_sra_epi16`.
+    #[inline(always)]
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let lanes: __m256i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sra_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` right by the matching lane of `b`, one scalar lane at a time.
+    #[inline(always)]
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let lanes = core::array::from_fn(|lane| a[lane] >> b[lane]);
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b` using `_mm256_cmpeq_epi16_mask`; the result is stored as a bitmask.
+    #[inline(always)]
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpeq_epi16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise signed `a < b` using `_mm256_cmplt_epi16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmplt_epi16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise signed `a <= b` using `_mm256_cmple_epi16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmple_epi16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise signed `a >= b` using `_mm256_cmpge_epi16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpge_epi16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise signed `a > b` using `_mm256_cmpgt_epi16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpgt_epi16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Interleaves the first 8 lanes of `a` and `b` (`a0, b0, a1, b1, ...`) with a single
+    /// two-source 16-bit permute; indices 16 and above select from `b`.
+    #[inline(always)]
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let zipped = unsafe {
+            let idx = _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            _mm256_permutex2var_epi16(lhs, idx, rhs)
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the last 8 lanes of `a` and `b` (`a8, b8, a9, b9, ...`) with a single
+    /// two-source 16-bit permute; indices 16 and above select from `b`.
+    #[inline(always)]
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let zipped = unsafe {
+            let idx =
+                _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            _mm256_permutex2var_epi16(lhs, idx, rhs)
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of the concatenation `a, b` with a single two-source
+    /// 16-bit permute.
+    #[inline(always)]
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let evens = unsafe {
+            let idx =
+                _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            _mm256_permutex2var_epi16(lhs, idx, rhs)
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of the concatenation `a, b` with a single two-source
+    /// 16-bit permute.
+    #[inline(always)]
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let odds = unsafe {
+            let idx =
+                _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            _mm256_permutex2var_epi16(lhs, idx, rhs)
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both halves of the lane-wise interleave of `a` and `b`.
+    ///
+    /// The two permutes are exactly the ones performed by `zip_low_i16x16` and
+    /// `zip_high_i16x16`, so this delegates to them.
+    #[inline(always)]
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let low = self.zip_low_i16x16(a, b);
+        let high = self.zip_high_i16x16(a, b);
+        (low, high)
+    }
+    /// Splits the concatenation `a, b` into its even-indexed and odd-indexed lanes.
+    ///
+    /// The two permutes are exactly the ones performed by `unzip_low_i16x16` and
+    /// `unzip_high_i16x16`, so this delegates to them.
+    #[inline(always)]
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let evens = self.unzip_low_i16x16(a, b);
+        let odds = self.unzip_high_i16x16(a, b);
+        (evens, odds)
+    }
+    /// Picks lanes from `b` where the mask bit in `a` is set and from `c` where it is clear,
+    /// using `_mm256_mask_blend_epi16`.
+    #[inline(always)]
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
+        let (if_set, if_clear): (__m256i, __m256i) = (b.into(), c.into());
+        let picked = unsafe { _mm256_mask_blend_epi16(a.val, if_clear, if_set) };
+        picked.simd_into(self)
+    }
+    /// Lane-wise signed minimum of `a` and `b` via `_mm256_min_epi16`.
+    #[inline(always)]
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let smallest = unsafe { _mm256_min_epi16(lhs, rhs) };
+        smallest.simd_into(self)
+    }
+    /// Lane-wise signed maximum of `a` and `b` via `_mm256_max_epi16`.
+    #[inline(always)]
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let largest = unsafe { _mm256_max_epi16(lhs, rhs) };
+        largest.simd_into(self)
+    }
+    /// Concatenates `a` (lower 256 bits) and `b` (upper 256 bits) into one 512-bit vector.
+    #[inline(always)]
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        let (lower, upper): (__m256i, __m256i) = (a.into(), b.into());
+        let wide = unsafe { _mm512_inserti64x4::<1>(_mm512_castsi256_si512(lower), upper) };
+        wide.simd_into(self)
+    }
+    /// Splits `a` into its lower and upper 128-bit halves with `_mm256_extracti128_si256`.
+    #[inline(always)]
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        let whole: __m256i = a.into();
+        let lower = unsafe { _mm256_extracti128_si256::<0>(whole) };
+        let upper = unsafe { _mm256_extracti128_si256::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Negates each lane by computing `0 - a` with `_mm256_sub_epi16`.
+    #[inline(always)]
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let lanes: __m256i = a.into();
+        let negated = unsafe { _mm256_sub_epi16(_mm256_setzero_si256(), lanes) };
+        negated.simd_into(self)
+    }
+    /// Reinterprets the register behind `a` as a `u8x32` without converting any lanes.
+    #[inline(always)]
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets the register behind `a` as a `u32x8` without converting any lanes.
+    #[inline(always)]
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+        let raw: __m256i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` to all 16 lanes; the value is bit-cast to `i16` for `_mm256_set1_epi16`.
+    #[inline(always)]
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+        let lanes = unsafe { _mm256_set1_epi16(val.cast_signed()) };
+        lanes.simd_into(self)
+    }
+    /// Builds a `u16x16` from an array passed by value, copying its bytes into the vector
+    /// storage through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        let storage = unsafe { crate::support::checked_transmute_copy(&val) };
+        u16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Builds a `u16x16` from a borrowed array, copying its bytes into the vector storage
+    /// through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        let storage = unsafe { crate::support::checked_transmute_copy(val) };
+        u16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Returns the 16 lanes of `a` as a plain array.
+    #[inline(always)]
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        let raw: __m256i = a.val.0;
+        // SAFETY: both types are 32 bytes and every bit pattern is a valid `[u16; 16]`.
+        unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(raw) }
+    }
+    /// Borrows the lanes of `a` as a plain array without copying.
+    #[inline(always)]
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        let raw: &__m256i = &a.val.0;
+        // SAFETY: both pointees are 32 bytes and every bit pattern is a valid `[u16; 16]`.
+        unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(raw) }
+    }
+    /// Mutably borrows the lanes of `a` as a plain array without copying.
+    #[inline(always)]
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        // SAFETY: both pointees are 32 bytes and every bit pattern is valid for either type.
+        unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(raw) }
+    }
+    /// Copies the 16 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+        let src = (&raw const a.val.0).cast::<u16>();
+        // SAFETY: `src` points at the 32-byte register held in the local `a`, `dest` is a
+        // distinct 16-element array, so both ranges are valid and cannot overlap.
+        unsafe { core::ptr::copy_nonoverlapping(src, dest.as_mut_ptr(), 16usize) }
+    }
+    /// Reinterprets the storage of a byte vector as a `u16x16` via transmute; no lane values
+    /// are converted.
+    #[inline(always)]
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u16x16 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Reinterprets the storage of a `u16x16` as a byte vector via transmute; no lane values
+    /// are converted.
+    #[inline(always)]
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Returns the 16 lanes of the concatenation `a, b` that start at lane `SHIFT`.
+    ///
+    /// `SHIFT >= 16` returns `b` unchanged. Otherwise the slide is done on bytes: each byte
+    /// index is offset by `SHIFT * 2` and fed to a two-source byte permute, where indices
+    /// 0..=31 address `a` and 32..=63 address `b`.
+    #[inline(always)]
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u16x16(a).val.0;
+        let second = self.cvt_to_bytes_u16x16(b).val.0;
+        let shuffled = unsafe {
+            let byte_ids = _mm256_setr_epi8(
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            );
+            let offset = _mm256_set1_epi8((SHIFT * 2usize) as i8);
+            _mm256_permutex2var_epi8(first, _mm256_add_epi8(byte_ids, offset), second)
+        };
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(shuffled),
+            simd: self,
+        })
+    }
+    /// Applies `slide_within_blocks_u16x8::<SHIFT>` to each 128-bit half of `a` and `b`
+    /// separately and joins the two results.
+    #[inline(always)]
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+        self,
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        let (a_lo, a_hi) = self.split_u16x16(a);
+        let (b_lo, b_hi) = self.split_u16x16(b);
+        let lo = self.slide_within_blocks_u16x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_u16x8::<SHIFT>(a_hi, b_hi);
+        self.combine_u16x8(lo, hi)
+    }
+    /// Adds the 16-bit lanes of `a` and `b` with `_mm256_add_epi16`.
+    #[inline(always)]
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let sum = unsafe { _mm256_add_epi16(lhs, rhs) };
+        sum.simd_into(self)
+    }
+    /// Subtracts the 16-bit lanes of `b` from those of `a` with `_mm256_sub_epi16`.
+    #[inline(always)]
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let difference = unsafe { _mm256_sub_epi16(lhs, rhs) };
+        difference.simd_into(self)
+    }
+    /// Multiplies 16-bit lanes and keeps the low 16 bits of each product (`_mm256_mullo_epi16`).
+    #[inline(always)]
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let product = unsafe { _mm256_mullo_epi16(lhs, rhs) };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of `a` and `b` via `_mm256_and_si256`.
+    #[inline(always)]
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_and_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of `a` and `b` via `_mm256_or_si256`.
+    #[inline(always)]
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_or_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of `a` and `b` via `_mm256_xor_si256`.
+    #[inline(always)]
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let bits = unsafe { _mm256_xor_si256(lhs, rhs) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT of `a`, expressed as an XOR with the all-ones scalar `!0`.
+    #[inline(always)]
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every 16-bit lane left by `shift` bits with `_mm256_sll_epi16`.
+    #[inline(always)]
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let lanes: __m256i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_sll_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` left by the matching lane of `b`, one scalar lane at a time.
+    #[inline(always)]
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lanes = core::array::from_fn(|lane| a[lane] << b[lane]);
+        lanes.simd_into(self)
+    }
+    /// Logical (zero-filling) right shift of every 16-bit lane by `shift` bits with
+    /// `_mm256_srl_epi16`.
+    #[inline(always)]
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let lanes: __m256i = a.into();
+        let shifted = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            _mm256_srl_epi16(lanes, count)
+        };
+        shifted.simd_into(self)
+    }
+    /// Shifts each lane of `a` right by the matching lane of `b`, one scalar lane at a time.
+    #[inline(always)]
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let lanes = core::array::from_fn(|lane| a[lane] >> b[lane]);
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b` using `_mm256_cmpeq_epu16_mask`; the result is stored as a bitmask.
+    #[inline(always)]
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpeq_epu16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a < b` using `_mm256_cmplt_epu16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmplt_epu16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a <= b` using `_mm256_cmple_epu16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmple_epu16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a >= b` using `_mm256_cmpge_epu16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpge_epu16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    /// Lane-wise unsigned `a > b` using `_mm256_cmpgt_epu16_mask`; the result is a bitmask.
+    #[inline(always)]
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+        let val = unsafe { _mm256_cmpgt_epu16_mask(lhs, rhs) };
+        mask16x16 { val, simd: self }
+    }
+    #[inline(always)] // Interleaves the low halves of `a` and `b`: a0, b0, a1, b1, ..., a7, b7.
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        unsafe {
+            _mm256_permutex2var_epi16(
+                a.into(),
+                _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), // indices 0..=15 pick lanes of `a`, 16..=31 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high halves of `a` and `b`: a8, b8, a9, b9, ..., a15, b15.
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        unsafe {
+            _mm256_permutex2var_epi16(
+                a.into(),
+                _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), // indices 0..=15 pick lanes of `a`, 16..=31 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Even-indexed lanes of the concatenation `a ++ b`: a0, a2, ..., a14, b0, b2, ..., b14.
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        unsafe {
+            _mm256_permutex2var_epi16(
+                a.into(),
+                _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), // indices 0..=15 pick lanes of `a`, 16..=31 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Odd-indexed lanes of the concatenation `a ++ b`: a1, a3, ..., a15, b1, b3, ..., b15.
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        unsafe {
+            _mm256_permutex2var_epi16(
+                a.into(),
+                _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), // indices 0..=15 pick lanes of `a`, 16..=31 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns both interleavings of `a` and `b`: (zip of the low halves, zip of the high halves).
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_epi16(
+                    a,
+                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), // a0, b0, ..., a7, b7
+                    b,
+                )
+                .simd_into(self),
+                _mm256_permutex2var_epi16(
+                    a,
+                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), // a8, b8, ..., a15, b15
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Splits the concatenation `a ++ b` into (even-indexed lanes, odd-indexed lanes).
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_epi16(
+                    a,
+                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), // even lanes; 16..=31 index into `b`
+                    b,
+                )
+                .simd_into(self),
+                _mm256_permutex2var_epi16(
+                    a,
+                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), // odd lanes; 16..=31 index into `b`
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Per-lane select: takes `b` where the mask bit is set and `c` where it is clear.
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+        unsafe { _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } // the blend takes its second vector for set bits, hence the `c, b` order
+    }
+    #[inline(always)] // Lane-wise unsigned minimum of `a` and `b`.
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        unsafe { _mm256_min_epu16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise unsigned maximum of `a` and `b`.
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        unsafe { _mm256_max_epu16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Concatenates `a` (low 256 bits) and `b` (high 256 bits) into one 512-bit vector.
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        unsafe {
+            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) // the cast places `a` in the low half; the insert overwrites the upper half with `b`
+        }
+    }
+    #[inline(always)] // Splits `a` into its (low, high) 128-bit halves.
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        unsafe {
+            (
+                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
+                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Narrows each 16-bit lane to 8 bits by truncation (the low byte is kept).
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+        unsafe { _mm256_cvtepi16_epi8(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Re-types the same 256-bit register as 32 x u8; no lane values are converted.
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        __m256i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Re-types the same 256-bit register as 8 x u32; no lane values are converted.
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+        __m256i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Builds a mask with all 16 lane bits set when `val` is true, all clear otherwise.
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+        mask16x16 {
+            val: (if val { 65535u64 } else { 0 }) as _, // 65535 == 0xFFFF: one bit per lane
+            simd: self,
+        }
+    }
+    #[inline(always)] // Packs an array of lane values into a bitmask: bit i is set iff `val[i]` is non-zero.
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        let val = &val;
+        let mut bits = 0u64;
+        let mut i = 0usize;
+        while i < 16usize {
+            if val[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        mask16x16 {
+            val: (bits) as _, // only bits 0..=15 can be set, so narrowing to the mask type loses nothing
+            simd: self,
+        }
+    }
+    #[inline(always)] // Expands the bitmask into an array: lane i is all-ones (`!0`) if bit i is set, else 0.
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        let bits = u64::from((a).val);
+        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+    }
+    #[inline(always)] // Builds a mask from the low 16 bits of `bits`; any higher bits are discarded.
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        mask16x16 {
+            val: (bits & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the mask as a `u64` with one bit per lane in bits 0..=15.
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        u64::from((a).val) & 65535u64
+    }
+    #[inline(always)] // Bitwise AND of two masks, limited to the 16 lane bits.
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Bitwise OR of two masks, limited to the 16 lane bits.
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Bitwise XOR of two masks, limited to the 16 lane bits.
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Inverts every lane bit of the mask.
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((!u64::from((a).val)) & 65535u64) as _, // the `& 65535` drops the bits above lane 15 that the 64-bit NOT turned on
+            simd: self,
+        }
+    }
+    #[inline(always)] // Per-lane mask select: bits of `b` where `a` is set, bits of `c` where `a` is clear.
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
+        mask16x16 {
+            val: (((u64::from((a).val) & u64::from((b).val))
+                | ((!u64::from((a).val)) & u64::from((c).val)))
+                & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Lane-wise mask equality: a result bit is set where `a` and `b` have the same bit.
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: (!u64::from(a.val ^ b.val) & 65535u64) as _, // NOT of XOR (i.e. XNOR), then limited to the 16 lane bits
+            simd: self,
+        }
+    }
+    #[inline(always)] // True if at least one of the 16 lane bits is set.
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits != 0
+    }
+    #[inline(always)] // True if all 16 lane bits are set.
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits == 65535u64
+    }
+    #[inline(always)] // True if at least one of the 16 lane bits is clear.
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits != 65535u64
+    }
+    #[inline(always)] // True if none of the 16 lane bits is set.
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits == 0
+    }
+    #[inline(always)] // Concatenates two 16-lane masks: `a` fills bits 0..=15 and `b` fills bits 16..=31.
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64; // 4294967295 == 0xFFFF_FFFF: 32 lane bits
+        mask16x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Splits a 16-lane mask into (bits 0..=7, bits 8..=15) as two 8-lane masks.
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+        let bits = u64::from(a.val);
+        (
+            mask16x8 {
+                val: (bits & 255u64) as _,
+                simd: self,
+            },
+            mask16x8 {
+                val: ((bits >> 8usize) & 255u64) as _,
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)] // Broadcasts `val` to all eight 32-bit lanes.
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+        unsafe { _mm256_set1_epi32(val).simd_into(self) }
+    }
+    #[inline(always)] // Builds a vector from an `[i32; 8]` passed by value.
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // NOTE(review): crate helper not visible here; presumably a size-checked bit copy — confirm
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds a vector from a borrowed `[i32; 8]`.
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // NOTE(review): crate helper not visible here; presumably a size-checked bit copy — confirm
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the eight lanes as a plain array (bit-for-bit transmute of the register).
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
+    }
+    #[inline(always)] // Views the vector's storage as `&[i32; 8]` without copying.
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) }
+    }
+    #[inline(always)] // Views the vector's storage as `&mut [i32; 8]` so lanes can be edited in place.
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) }
+    }
+    #[inline(always)] // Copies the eight lanes of `a` into `dest`.
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const i32,
+                dest.as_mut_ptr(),
+                8usize, // count is in `i32` elements, not bytes
+            );
+        }
+    }
+    #[inline(always)] // Re-types a 32-byte vector as eight i32 lanes (bit-for-bit, no value conversion).
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        unsafe {
+            i32x8 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Re-types eight i32 lanes as a 32-byte vector (bit-for-bit, no value conversion).
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        unsafe {
+            u8x32 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Eight-lane window over the concatenation `a ++ b`, starting `SHIFT` lanes into `a`.
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            if SHIFT >= 8usize { // a shift of 8 or more returns `b` unchanged
+                return b;
+            }
+            let idx = _mm256_add_epi8( // byte indices 0..=31, each offset by the shift expressed in bytes
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ),
+                _mm256_set1_epi8((SHIFT * 4usize) as i8), // 4 bytes per lane; at most 28 here, so the cast cannot overflow
+            );
+            let result = _mm256_permutex2var_epi8( // indices 32 and above select bytes of the second source (`b`)
+                self.cvt_to_bytes_i32x8(a).val.0,
+                idx,
+                self.cvt_to_bytes_i32x8(b).val.0,
+            );
+            self.cvt_from_bytes_i32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Applies the 4-lane slide to each 128-bit half of `a`/`b` independently, then recombines.
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
+        self,
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(
+            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)] // Lane-wise addition; each lane wraps on overflow.
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise subtraction `a - b`; each lane wraps on overflow.
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise multiplication keeping the low 32 bits of each product.
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise AND of the two 256-bit vectors.
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise OR of the two 256-bit vectors.
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise XOR of the two 256-bit vectors.
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise NOT, written as XOR with `!0` (all bits set).
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        a ^ !0
+    }
+    #[inline(always)] // Left shift of every 32-bit lane by the same count `shift`.
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        unsafe {
+            _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low lane of a 128-bit register
+        }
+    }
+    #[inline(always)] // Per-lane left shift: lane i of `a` is shifted by lane i of `b`.
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Arithmetic (sign-propagating) right shift of every 32-bit lane by the same count `shift`.
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        unsafe {
+            _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low lane of a 128-bit register
+        }
+    }
+    #[inline(always)] // Per-lane arithmetic right shift: lane i of `a` is shifted by lane i of `b`.
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_srav_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a == b`, returned as a compact bitmask with one bit per lane.
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmpeq_epi32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a < b`, returned as a compact bitmask with one bit per lane.
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmplt_epi32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a <= b`, returned as a compact bitmask with one bit per lane.
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmple_epi32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a >= b`, returned as a compact bitmask with one bit per lane.
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmpge_epi32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise signed `a > b`, returned as a compact bitmask with one bit per lane.
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmpgt_epi32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low halves of `a` and `b`: a0, b0, a1, b1, a2, b2, a3, b3.
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high halves of `a` and `b`: a4, b4, a5, b5, a6, b6, a7, b7.
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Even-indexed lanes of the concatenation `a ++ b`: a0, a2, a4, a6, b0, b2, b4, b6.
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Odd-indexed lanes of the concatenation `a ++ b`: a1, a3, a5, a7, b1, b3, b5, b7.
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns both interleavings of `a` and `b`: (zip of the low halves, zip of the high halves).
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) // a0, b0, ..., a3, b3
+                    .simd_into(self),
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) // a4, b4, ..., a7, b7
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Splits the concatenation `a ++ b` into (even-indexed lanes, odd-indexed lanes).
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) // even lanes; 8..=15 index into `b`
+                    .simd_into(self),
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) // odd lanes; 8..=15 index into `b`
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Per-lane select: takes `b` where the mask bit is set and `c` where it is clear.
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } // the blend takes its second vector for set bits, hence the `c, b` order
+    }
+    #[inline(always)] // Lane-wise signed minimum of `a` and `b`.
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_min_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise signed maximum of `a` and `b`.
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_max_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Concatenates `a` (low 256 bits) and `b` (high 256 bits) into one 512-bit vector.
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
+        unsafe {
+            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) // the cast places `a` in the low half; the insert overwrites the upper half with `b`
+        }
+    }
+    #[inline(always)] // Splits `a` into its (low, high) 128-bit halves.
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
+        unsafe {
+            (
+                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
+                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Lane-wise negation computed as `0 - a`; each lane wraps on overflow.
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        unsafe { _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Re-types the same 256-bit register as 32 x u8; no lane values are converted.
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        __m256i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Re-types the same 256-bit register as 8 x u32; no lane values are converted.
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
+        __m256i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Converts each signed 32-bit lane to `f32`.
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+        unsafe { _mm256_cvtepi32_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Broadcasts `val` to all eight 32-bit lanes.
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        unsafe { _mm256_set1_epi32(val.cast_signed()).simd_into(self) } // the intrinsic takes `i32`; `cast_signed` keeps the bit pattern
+    }
+    #[inline(always)] // Builds a vector from a `[u32; 8]` passed by value.
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // NOTE(review): crate helper not visible here; presumably a size-checked bit copy — confirm
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds a vector from a borrowed `[u32; 8]`.
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // NOTE(review): crate helper not visible here; presumably a size-checked bit copy — confirm
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the eight lanes as a plain array (bit-for-bit transmute of the register).
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) }
+    }
+    #[inline(always)] // Views the vector's storage as `&[u32; 8]` without copying.
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) }
+    }
+    #[inline(always)] // Views the vector's storage as `&mut [u32; 8]` so lanes can be edited in place.
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) }
+    }
+    #[inline(always)] // Copies the eight lanes of `a` into `dest`.
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const u32,
+                dest.as_mut_ptr(),
+                8usize, // count is in `u32` elements, not bytes
+            );
+        }
+    }
+    #[inline(always)] // Re-types a 32-byte vector as eight u32 lanes (bit-for-bit, no value conversion).
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        unsafe {
+            u32x8 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Re-types eight u32 lanes as a 32-byte vector (bit-for-bit, no value conversion).
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        unsafe {
+            u8x32 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Eight-lane window over the concatenation `a ++ b`, starting `SHIFT` lanes into `a`.
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            if SHIFT >= 8usize { // a shift of 8 or more returns `b` unchanged
+                return b;
+            }
+            let idx = _mm256_add_epi8( // byte indices 0..=31, each offset by the shift expressed in bytes
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ),
+                _mm256_set1_epi8((SHIFT * 4usize) as i8), // 4 bytes per lane; at most 28 here, so the cast cannot overflow
+            );
+            let result = _mm256_permutex2var_epi8( // indices 32 and above select bytes of the second source (`b`)
+                self.cvt_to_bytes_u32x8(a).val.0,
+                idx,
+                self.cvt_to_bytes_u32x8(b).val.0,
+            );
+            self.cvt_from_bytes_u32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)] // Applies the 4-lane slide to each 128-bit half of `a`/`b` independently, then recombines.
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+        self,
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(
+            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)] // Lane-wise addition; each lane wraps on overflow.
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise subtraction `a - b`; each lane wraps on overflow.
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_sub_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise multiplication keeping the low 32 bits of each product.
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_mullo_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise AND of the two 256-bit vectors.
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise OR of the two 256-bit vectors.
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_or_si256(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise XOR of the two 256-bit vectors.
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_xor_si256(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Bitwise NOT, written as XOR with `!0` (all bits set).
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+        a ^ !0
+    }
+    #[inline(always)] // Left shift of every 32-bit lane by the same count `shift`.
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        unsafe {
+            _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low lane of a 128-bit register
+        }
+    }
+    #[inline(always)] // Per-lane left shift: lane i of `a` is shifted by lane i of `b`.
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_sllv_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Logical (zero-filling) right shift of every 32-bit lane by the same count `shift`.
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        unsafe {
+            _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low lane of a 128-bit register
+        }
+    }
+    #[inline(always)] // Per-lane logical right shift: lane i of `a` is shifted by lane i of `b`.
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_srlv_epi32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a == b`, returned as a compact bitmask with one bit per lane.
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmpeq_epu32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a < b`, returned as a compact bitmask with one bit per lane.
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmplt_epu32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a <= b`, returned as a compact bitmask with one bit per lane.
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmple_epu32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a >= b`, returned as a compact bitmask with one bit per lane.
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmpge_epu32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise unsigned `a > b`, returned as a compact bitmask with one bit per lane.
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        unsafe {
+            mask32x8 {
+                val: _mm256_cmpgt_epu32_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low halves of `a` and `b`: a0, b0, a1, b1, a2, b2, a3, b3.
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high halves of `a` and `b`: a4, b4, a5, b5, a6, b6, a7, b7.
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Even-indexed lanes of the concatenation `a ++ b`: a0, a2, a4, a6, b0, b2, b4, b6.
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Odd-indexed lanes of the concatenation `a ++ b`: a1, a3, a5, a7, b1, b3, b5, b7.
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe {
+            _mm256_permutex2var_epi32(
+                a.into(),
+                _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), // indices 0..=7 pick lanes of `a`, 8..=15 lanes of `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns both interleavings of `a` and `b`: (zip of the low halves, zip of the high halves).
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b) // a0, b0, ..., a3, b3
+                    .simd_into(self),
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b) // a4, b4, ..., a7, b7
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Splits the concatenation `a ++ b` into (even-indexed lanes, odd-indexed lanes).
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b) // even lanes; 8..=15 index into `b`
+                    .simd_into(self),
+                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b) // odd lanes; 8..=15 index into `b`
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Per-lane select: takes `b` where the mask bit is set and `c` where it is clear.
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } // the blend takes its second vector for set bits, hence the `c, b` order
+    }
+    #[inline(always)] // Lane-wise unsigned minimum of `a` and `b`.
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_min_epu32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise unsigned maximum of `a` and `b`.
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        unsafe { _mm256_max_epu32(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Concatenates `a` (low 256 bits) and `b` (high 256 bits) into one 512-bit vector.
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+        unsafe {
+            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self) // the cast places `a` in the low half; the insert overwrites the upper half with `b`
+        }
+    }
+    #[inline(always)] // Splits `a` into its (low, high) 128-bit halves.
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        unsafe {
+            (
+                _mm256_extracti128_si256::<0>(a.into()).simd_into(self),
+                _mm256_extracti128_si256::<1>(a.into()).simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Re-types the same 256-bit register as 32 x u8; no lane values are converted.
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        __m256i::from(a).simd_into(self)
+    }
+    #[inline(always)] // Converts each unsigned 32-bit lane to `f32` by splitting it into 16-bit halves and recombining them as floats.
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+        unsafe {
+            let a = a.into();
+            let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000)); // low 16 bits under 0x4B00: the bit pattern of the float 2^23 + lo
+            let hi = _mm256_blend_epi16::<0xAA>( // high 16 bits under 0x5300: the bit pattern of the float 2^39 + hi * 2^16
+                _mm256_srli_epi32::<16>(a),
+                _mm256_set1_epi32(0x53000000),
+            );
+            let fhi = _mm256_sub_ps( // leaves hi * 2^16 - 2^23
+                _mm256_castsi256_ps(hi),
+                _mm256_set1_ps(f32::from_bits(0x53000080)), // 0x53000080 is the float 2^39 + 2^23
+            );
+            let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi); // (2^23 + lo) + (hi * 2^16 - 2^23): the 2^23 terms cancel, leaving the lane's value
+            result.simd_into(self)
+        }
+    }
+    #[inline(always)] // Builds a mask with all 8 lane bits set when `val` is true, all clear otherwise.
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+        mask32x8 {
+            val: (if val { 255u64 } else { 0 }) as _, // 255 == 0xFF: one bit per lane
+            simd: self,
+        }
+    }
+    #[inline(always)] // Packs an array of lane values into a bitmask: bit i is set iff `val[i]` is non-zero.
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+        let val = &val;
+        let mut bits = 0u64;
+        let mut i = 0usize;
+        while i < 8usize {
+            if val[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        mask32x8 {
+            val: (bits) as _, // only bits 0..=7 can be set, so narrowing to the mask type loses nothing
+            simd: self,
+        }
+    }
+    #[inline(always)] // Expands the bitmask into an array: lane i is all-ones (`!0`) if bit i is set, else 0.
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+        let bits = u64::from((a).val);
+        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+    }
+    #[inline(always)] // Builds a mask from the low 8 bits of `bits`; any higher bits are discarded.
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        mask32x8 {
+            val: (bits & 255u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the mask as a `u64` with one bit per lane in bits 0..=7.
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        u64::from((a).val) & 255u64
+    }
+    #[inline(always)] // Bitwise AND of two masks, limited to the 8 lane bits.
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Bitwise OR of two masks, limited to the 8 lane bits.
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        // Lane-wise XOR is a plain bitwise XOR of the two packed masks.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let lanes = (lhs ^ rhs) & 255u64;
+        mask32x8 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        // Flip every bit, then drop the ones above the 8 lane bits.
+        let flipped = !u64::from(a.val);
+        let lanes = flipped & 255u64;
+        mask32x8 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn select_mask32x8(
+        self,
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
+        // Per lane: take `b` where the selector `a` is set, `c` where it is clear.
+        let selector = u64::from(a.val);
+        let from_b = selector & u64::from(b.val);
+        let from_c = !selector & u64::from(c.val);
+        let lanes = (from_b | from_c) & 255u64;
+        mask32x8 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        // A lane compares equal exactly when the two mask bits do not differ.
+        let differing = u64::from(a.val ^ b.val);
+        let lanes = !differing & 255u64;
+        mask32x8 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        // True as soon as at least one of the 8 lane bits is set.
+        (u64::from(a.val) & 255u64) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        // True only when every one of the 8 lane bits is set.
+        (u64::from(a.val) & 255u64) == 255u64
+    }
+    #[inline(always)]
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        // True when at least one of the 8 lane bits is clear.
+        (u64::from(a.val) & 255u64) != 255u64
+    }
+    #[inline(always)]
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        // True only when none of the 8 lane bits is set.
+        (u64::from(a.val) & 255u64) == 0
+    }
+    #[inline(always)]
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        // `a` supplies lanes 0..8 (low byte), `b` lanes 8..16 (next byte).
+        let low = u64::from(a.val);
+        let high = u64::from(b.val) << 8usize;
+        let lanes = (low | high) & 65535u64;
+        mask32x16 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
+        // Lanes 0..4 sit in the low nibble of the packed mask and lanes 4..8
+        // in the high nibble; each returned half keeps only its own four bits.
+        // The tuple is ordered (low half, high half).
+        let packed = u64::from(a.val);
+        let low_lanes = packed & 15u64;
+        let high_lanes = (packed >> 4usize) & 15u64;
+        let simd = self;
+        (
+            mask32x4 { val: low_lanes as _, simd },
+            mask32x4 { val: high_lanes as _, simd },
+        )
+    }
+    #[inline(always)] // Broadcasts `val` into all four f64 lanes.
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+        unsafe { _mm256_set1_pd(val).simd_into(self) }
+    }
+    #[inline(always)] // Builds an f64x4 from an owned array of four f64 values.
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // presumably a size-checked bit copy (helper not shown)
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds an f64x4 from a borrowed array of four f64 values.
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // presumably a size-checked bit copy (helper not shown)
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the four lanes as a plain array (bit-for-bit reinterpretation).
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) }
+    }
+    #[inline(always)] // Borrows the vector's storage as `&[f64; 4]` without copying.
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        unsafe { core::mem::transmute::<&__m256d, &[f64; 4usize]>(&a.val.0) }
+    }
+    #[inline(always)] // Mutably borrows the vector's storage as `&mut [f64; 4]` without copying.
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        unsafe { core::mem::transmute::<&mut __m256d, &mut [f64; 4usize]>(&mut a.val.0) }
+    }
+    #[inline(always)] // Writes the four lanes of `a` into `dest`.
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const f64, // `a` is a by-value local, so it cannot overlap `dest`
+                dest.as_mut_ptr(),
+                4usize, // element count (f64 values), not bytes
+            );
+        }
+    }
+    #[inline(always)] // Reinterprets the 32 bytes of `a` as an f64x4; the bits are unchanged.
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        unsafe {
+            f64x4 {
+                val: core::mem::transmute(a.val), // same-size storage reinterpretation
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Reinterprets the f64x4 as its 32 raw bytes; the bits are unchanged.
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
+        unsafe {
+            u8x32 {
+                val: core::mem::transmute(a.val), // same-size storage reinterpretation
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Returns four lanes of the concatenation `a ++ b`, starting at lane SHIFT.
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            if SHIFT >= 4usize {
+                return b; // a shift of a whole vector or more yields `b` unchanged
+            }
+            let idx = _mm256_add_epi8(
+                _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                ), // identity byte indices 0..=31
+                _mm256_set1_epi8((SHIFT * 8usize) as i8), // 8 bytes per f64 lane; at most 24 here
+            );
+            let result = _mm256_permutex2var_epi8(
+                self.cvt_to_bytes_f64x4(a).val.0, // byte indices 0..=31 read from `a`
+                idx,
+                self.cvt_to_bytes_f64x4(b).val.0, // byte indices 32..=63 read from `b`
+            );
+            self.cvt_from_bytes_f64x4(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
+        self,
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
+        // Slide each two-lane half independently, then glue the halves back together.
+        let (a_lo, a_hi) = self.split_f64x4(a);
+        let (b_lo, b_hi) = self.split_f64x4(b);
+        let lo = self.slide_within_blocks_f64x2::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_f64x2::<SHIFT>(a_hi, b_hi);
+        self.combine_f64x2(lo, hi)
+    }
+    #[inline(always)] // Absolute value: clears each lane's sign bit (AND-NOT with the -0.0 bit pattern).
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Negation: flips each lane's sign bit by XOR with the -0.0 bit pattern.
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise square root.
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_sqrt_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Reciprocal of each lane, computed here with an ordinary division.
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        1.0 / a
+    }
+    #[inline(always)] // Lane-wise `a + b`.
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_add_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a - b`.
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_sub_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a * b`.
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_mul_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a / b`.
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_div_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Per lane: the magnitude of `a` combined with the sign bit of `b`.
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let mask = _mm256_set1_pd(-0.0); // -0.0 has only the sign bit set
+            _mm256_or_pd(
+                _mm256_and_pd(mask, b.into()), // sign bit taken from `b`
+                _mm256_andnot_pd(mask, a.into()), // all remaining bits taken from `a`
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Lane-wise `a == b` as a bitmask; predicate 0 is _CMP_EQ_OQ (false for NaN).
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        unsafe {
+            mask64x4 {
+                val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a < b` as a bitmask; predicate 17 is _CMP_LT_OQ (false for NaN).
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        unsafe {
+            mask64x4 {
+                val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a <= b` as a bitmask; predicate 18 is _CMP_LE_OQ (false for NaN).
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        unsafe {
+            mask64x4 {
+                val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a >= b` as a bitmask; predicate 29 is _CMP_GE_OQ (false for NaN).
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        unsafe {
+            mask64x4 {
+                val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a > b` as a bitmask; predicate 30 is _CMP_GT_OQ (false for NaN).
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        unsafe {
+            mask64x4 {
+                val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low halves: [a0, b0, a1, b1].
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into()) // indices 0..=3 -> `a`, 4..=7 -> `b`
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high halves: [a2, b2, a3, b3].
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into()) // indices 0..=3 -> `a`, 4..=7 -> `b`
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Even-indexed lanes of `a` followed by those of `b`: [a0, a2, b0, b2].
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into()) // indices 0..=3 -> `a`, 4..=7 -> `b`
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Odd-indexed lanes of `a` followed by those of `b`: [a1, a3, b1, b3].
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into()) // indices 0..=3 -> `a`, 4..=7 -> `b`
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns ([a0, b0, a1, b1], [a2, b2, a3, b3]).
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        unsafe {
+            let a = a.into(); // convert once; both permutes reuse the raw vectors
+            let b = b.into();
+            (
+                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(self), // low halves
+                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(self), // high halves
+            )
+        }
+    }
+    #[inline(always)] // Returns ([a0, a2, b0, b2], [a1, a3, b1, b3]).
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        unsafe {
+            let a = a.into(); // convert once; both permutes reuse the raw vectors
+            let b = b.into();
+            (
+                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(self), // even lanes
+                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(self), // odd lanes
+            )
+        }
+    }
+    #[inline(always)] // Lane-wise maximum straight from `_mm256_max_pd`, with no extra NaN handling.
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_max_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise minimum straight from `_mm256_min_pd`, with no extra NaN handling.
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_min_pd(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise maximum in which a NaN lane of `b` yields the lane of `a`.
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (lhs, rhs): (__m256d, __m256d) = (a.into(), b.into());
+        unsafe {
+            let rhs_nan = _mm256_cmp_pd_mask::<3i32>(rhs, rhs); // unordered with itself => NaN
+            _mm256_mask_blend_pd(rhs_nan, _mm256_max_pd(lhs, rhs), lhs).simd_into(self)
+        }
+    }
+    #[inline(always)] // Lane-wise minimum in which a NaN lane of `b` yields the lane of `a`.
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (lhs, rhs): (__m256d, __m256d) = (a.into(), b.into());
+        unsafe {
+            let rhs_nan = _mm256_cmp_pd_mask::<3i32>(rhs, rhs); // unordered with itself => NaN
+            _mm256_mask_blend_pd(rhs_nan, _mm256_min_pd(lhs, rhs), lhs).simd_into(self)
+        }
+    }
+    #[inline(always)] // Fused multiply-add: `a * b + c` per lane.
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)] // Fused multiply-subtract: `a * b - c` per lane.
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)] // Rounds each lane toward negative infinity.
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC: suppress FP exceptions
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Rounds each lane toward positive infinity.
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC: suppress FP exceptions
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Rounds each lane to the nearest integer, ties going to the even one.
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC: suppress FP exceptions
+                .simd_into(self)
+        }
+    }
+    #[inline(always)] // Fractional part: each lane minus its truncation toward zero.
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        a - self.trunc_f64x4(a)
+    }
+    #[inline(always)] // Rounds each lane toward zero (drops the fractional part).
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        unsafe {
+            _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+        }
+    }
+    #[inline(always)] // Per lane: `b` where the mask bit of `a` is set, otherwise `c`.
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } // blend picks its last operand on set bits
+    }
+    #[inline(always)] // Concatenates into an f64x8: `a` is the low 256 bits, `b` the high 256 bits.
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(self) // insert at index 1 fills the upper half
+        }
+    }
+    #[inline(always)] // Splits into (low 128 bits, high 128 bits) as two f64x2 vectors.
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        unsafe {
+            (
+                _mm256_extractf128_pd::<0>(a.into()).simd_into(self), // lanes 0..2
+                _mm256_extractf128_pd::<1>(a.into()).simd_into(self), // lanes 2..4
+            )
+        }
+    }
+    #[inline(always)] // Reinterprets the same 256 bits as f32x8; a cast, not a numeric conversion.
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+        unsafe { _mm256_castpd_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        // One bit per lane: all four bits set for `true`, none for `false`.
+        let lanes: u64 = if val { 15u64 } else { 0 };
+        let simd = self;
+        mask64x4 { val: lanes as _, simd }
+    }
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        // Pack the lanes into a bitmask: bit `i` is set iff lane `i` is nonzero.
+        let packed = val
+            .iter()
+            .enumerate()
+            .filter(|&(_, &lane)| lane != 0)
+            .fold(0u64, |acc, (lane_idx, _)| acc | (1u64 << lane_idx));
+        // Only bits 0..4 can be set by the fold above, one per array element;
+        // the cast converts the u64 accumulator to whatever integer type the
+        // `val` field of the mask is declared with.
+        mask64x4 {
+            val: packed as _,
+            simd: self,
+        }
+    }
+    #[inline(always)] // Expands the bitmask: set bit -> all-ones lane, clear bit -> 0.
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        let packed = u64::from(a.val);
+        core::array::from_fn(|lane| if (packed & (1u64 << lane)) == 0 { 0 } else { !0 })
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        // Only the low 4 bits map to lanes; anything above them is discarded.
+        let lanes = bits & 15u64;
+        let simd = self;
+        mask64x4 { val: lanes as _, simd }
+    }
+    #[inline(always)] // Returns the mask as an integer with one bit per lane.
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        15u64 & u64::from(a.val)
+    }
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        // Lane-wise AND is a plain bitwise AND of the two packed masks.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let lanes = (lhs & rhs) & 15u64;
+        mask64x4 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        // Lane-wise OR is a plain bitwise OR of the two packed masks.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let lanes = (lhs | rhs) & 15u64;
+        mask64x4 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        // Lane-wise XOR is a plain bitwise XOR of the two packed masks.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let lanes = (lhs ^ rhs) & 15u64;
+        mask64x4 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        // Flip every bit, then drop the ones above the 4 lane bits.
+        let flipped = !u64::from(a.val);
+        let lanes = flipped & 15u64;
+        mask64x4 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        // Per lane: take `b` where the selector `a` is set, `c` where it is clear.
+        let selector = u64::from(a.val);
+        let from_b = selector & u64::from(b.val);
+        let from_c = !selector & u64::from(c.val);
+        let lanes = (from_b | from_c) & 15u64;
+        mask64x4 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        // A lane compares equal exactly when the two mask bits do not differ.
+        let differing = u64::from(a.val ^ b.val);
+        let lanes = !differing & 15u64;
+        mask64x4 { val: lanes as _, simd: self }
+    }
+    #[inline(always)]
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        // True as soon as at least one of the 4 lane bits is set.
+        (u64::from(a.val) & 15u64) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        // True only when every one of the 4 lane bits is set.
+        (u64::from(a.val) & 15u64) == 15u64
+    }
+    #[inline(always)]
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        // True when at least one of the 4 lane bits is clear.
+        (u64::from(a.val) & 15u64) != 15u64
+    }
+    #[inline(always)]
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        // True only when none of the 4 lane bits is set.
+        (u64::from(a.val) & 15u64) == 0
+    }
+    #[inline(always)]
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        // Concatenate two 4-lane masks: `a` supplies lanes 0..4, `b` lanes 4..8.
+        // Each half is reduced to its 4 lane bits before merging, so a stray high
+        // bit in `a` can never alias one of `b`'s lanes in the combined mask.
+        let bits = (u64::from(a.val) & 15u64) | ((u64::from(b.val) & 15u64) << 4usize);
+        mask64x8 { val: bits as _, simd: self }
+    }
+    #[inline(always)]
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+        // Lanes 0..2 sit in bits 0..2 of the packed mask and lanes 2..4 in
+        // bits 2..4; each returned half keeps only its own two bits.
+        // The tuple is ordered (low half, high half).
+        let packed = u64::from(a.val);
+        let low_lanes = packed & 3u64;
+        let high_lanes = (packed >> 2usize) & 3u64;
+        let simd = self;
+        (
+            mask64x2 { val: low_lanes as _, simd },
+            mask64x2 { val: high_lanes as _, simd },
+        )
+    }
+    #[inline(always)] // Broadcasts `val` into all sixteen f32 lanes.
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+        unsafe { _mm512_set1_ps(val).simd_into(self) }
+    }
+    #[inline(always)] // Builds an f32x16 from an owned array of sixteen f32 values.
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // presumably a size-checked bit copy (helper not shown)
+            simd: self,
+        }
+    }
+    #[inline(always)] // Builds an f32x16 from a borrowed array of sixteen f32 values.
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // presumably a size-checked bit copy (helper not shown)
+            simd: self,
+        }
+    }
+    #[inline(always)] // Returns the sixteen lanes as a plain array (bit-for-bit reinterpretation).
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        unsafe { core::mem::transmute::<__m512, [f32; 16usize]>(a.val.0) }
+    }
+    #[inline(always)] // Borrows the vector's storage as `&[f32; 16]` without copying.
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        unsafe { core::mem::transmute::<&__m512, &[f32; 16usize]>(&a.val.0) }
+    }
+    #[inline(always)] // Mutably borrows the vector's storage as `&mut [f32; 16]` without copying.
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        unsafe { core::mem::transmute::<&mut __m512, &mut [f32; 16usize]>(&mut a.val.0) }
+    }
+    #[inline(always)] // Writes the sixteen lanes of `a` into `dest`.
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const f32, // `a` is a by-value local, so it cannot overlap `dest`
+                dest.as_mut_ptr(),
+                16usize, // element count (f32 values), not bytes
+            );
+        }
+    }
+    #[inline(always)] // Reinterprets the 64 bytes of `a` as an f32x16; the bits are unchanged.
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        unsafe {
+            f32x16 {
+                val: core::mem::transmute(a.val), // same-size storage reinterpretation
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Reinterprets the f32x16 as its 64 raw bytes; the bits are unchanged.
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        unsafe {
+            u8x64 {
+                val: core::mem::transmute(a.val), // same-size storage reinterpretation
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Returns sixteen lanes of the concatenation `a ++ b`, starting at lane SHIFT.
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            if SHIFT >= 16usize {
+                return b; // a shift of a whole vector or more yields `b` unchanged
+            }
+            let idx = _mm512_add_epi8(
+                _mm512_set_epi8(
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ), // `set` takes the highest byte first, so byte i holds the value i
+                _mm512_set1_epi8((SHIFT * 4usize) as i8), // 4 bytes per f32 lane; at most 60 here
+            );
+            let result = _mm512_permutex2var_epi8(
+                self.cvt_to_bytes_f32x16(a).val.0, // byte indices 0..=63 read from `a`
+                idx,
+                self.cvt_to_bytes_f32x16(b).val.0, // byte indices 64..=127 read from `b`
+            );
+            self.cvt_from_bytes_f32x16(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+        self,
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
+        // Apply the 8-lane block slide to each half, then rejoin the halves.
+        let (a_lo, a_hi) = self.split_f32x16(a);
+        let (b_lo, b_hi) = self.split_f32x16(b);
+        let lo = self.slide_within_blocks_f32x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_f32x8::<SHIFT>(a_hi, b_hi);
+        self.combine_f32x8(lo, hi)
+    }
+    #[inline(always)] // Absolute value: clears each lane's sign bit (AND-NOT with the -0.0 bit pattern).
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Negation: flips each lane's sign bit by XOR with the -0.0 bit pattern.
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise square root.
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_sqrt_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        // Delegate to the 8-lane approximation on each half and rejoin the results.
+        let (lower, upper) = self.split_f32x16(a);
+        let lower = self.approximate_recip_f32x8(lower);
+        let upper = self.approximate_recip_f32x8(upper);
+        self.combine_f32x8(lower, upper)
+    }
+    #[inline(always)] // Lane-wise `a + b`.
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_add_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a - b`.
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_sub_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a * b`.
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_mul_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise `a / b`.
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_div_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Per lane: the magnitude of `a` combined with the sign bit of `b`.
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            let mask = _mm512_set1_ps(-0.0); // -0.0 has only the sign bit set
+            _mm512_or_ps(
+                _mm512_and_ps(mask, b.into()), // sign bit taken from `b`
+                _mm512_andnot_ps(mask, a.into()), // all remaining bits taken from `a`
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Lane-wise `a == b` as a bitmask; predicate 0 is _CMP_EQ_OQ (false for NaN).
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a < b` as a bitmask; predicate 17 is _CMP_LT_OQ (false for NaN).
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a <= b` as a bitmask; predicate 18 is _CMP_LE_OQ (false for NaN).
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a >= b` as a bitmask; predicate 29 is _CMP_GE_OQ (false for NaN).
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Lane-wise `a > b` as a bitmask; predicate 30 is _CMP_GT_OQ (false for NaN).
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        unsafe {
+            mask32x16 {
+                val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)] // Interleaves the low halves: [a0, b0, a1, b1, ..., a7, b7].
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_ps(
+                a.into(),
+                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), // 0..=15 -> `a`, 16..=31 -> `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Interleaves the high halves: [a8, b8, a9, b9, ..., a15, b15].
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_ps(
+                a.into(),
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), // 0..=15 -> `a`, 16..=31 -> `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Even-indexed lanes of `a` followed by the even-indexed lanes of `b`.
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_ps(
+                a.into(),
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), // 0..=15 -> `a`, 16..=31 -> `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Odd-indexed lanes of `a` followed by the odd-indexed lanes of `b`.
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe {
+            _mm512_permutex2var_ps(
+                a.into(),
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), // 0..=15 -> `a`, 16..=31 -> `b`
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)] // Returns ([a0, b0, ..., a7, b7], [a8, b8, ..., a15, b15]).
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        unsafe {
+            let a = a.into(); // convert once; both permutes reuse the raw vectors
+            let b = b.into();
+            (
+                _mm512_permutex2var_ps(
+                    a,
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), // low halves
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_ps(
+                    a,
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), // high halves
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Returns (even-indexed lanes of a then b, odd-indexed lanes of a then b).
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        unsafe {
+            let a = a.into(); // convert once; both permutes reuse the raw vectors
+            let b = b.into();
+            (
+                _mm512_permutex2var_ps(
+                    a,
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), // even lanes
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_ps(
+                    a,
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), // odd lanes
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)] // Lane-wise maximum straight from `_mm512_max_ps`, with no extra NaN handling.
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_max_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise minimum straight from `_mm512_min_ps`, with no extra NaN handling.
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_min_ps(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)] // Lane-wise maximum in which a NaN lane of `b` yields the lane of `a`.
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (lhs, rhs): (__m512, __m512) = (a.into(), b.into());
+        unsafe {
+            let rhs_nan = _mm512_cmp_ps_mask::<3i32>(rhs, rhs); // unordered with itself => NaN
+            _mm512_mask_blend_ps(rhs_nan, _mm512_max_ps(lhs, rhs), lhs).simd_into(self)
+        }
+    }
+    #[inline(always)] // Lane-wise minimum in which a NaN lane of `b` yields the lane of `a`.
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (lhs, rhs): (__m512, __m512) = (a.into(), b.into());
+        unsafe {
+            let rhs_nan = _mm512_cmp_ps_mask::<3i32>(rhs, rhs); // unordered with itself => NaN
+            _mm512_mask_blend_ps(rhs_nan, _mm512_min_ps(lhs, rhs), lhs).simd_into(self)
+        }
+    }
+    #[inline(always)] // Fused multiply-add: `a * b + c` per lane.
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)] // Fused multiply-subtract: `a * b - c` per lane.
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
+    }
+    #[inline(always)] // Rounds every lane toward negative infinity with one 512-bit instruction.
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC; // scale 0 => whole numbers
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) } // no split/recombine
+    }
+    #[inline(always)] // Rounds every lane toward positive infinity with one 512-bit instruction.
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC; // scale 0 => whole numbers
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) } // no split/recombine
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        // Round every lane to the nearest integer, ties to even, in a single 512-bit
+        // VRNDSCALEPS (scale 0) instead of splitting into two 256-bit halves, rounding
+        // each and recombining; the immediate uses the usual `_MM_FROUND_*` encoding.
+        const MODE: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Fractional part: each lane minus its truncation toward zero.
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        a - self.trunc_f32x16(a)
+    }
+    #[inline(always)] // Rounds every lane toward zero with one 512-bit instruction.
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        const MODE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; // scale 0 => whole numbers
+        unsafe { _mm512_roundscale_ps::<MODE>(a.into()).simd_into(self) } // no split/recombine
+    }
+    #[inline(always)] // Per lane: `b` where the mask bit of `a` is set, otherwise `c`.
+    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) } // blend picks its last operand on set bits
+    }
+    #[inline(always)] // Splits into (low 256 bits, high 256 bits) as two f32x8 vectors.
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        unsafe {
+            (
+                _mm512_castps512_ps256(a.into()).simd_into(self), // lanes 0..8: the cast keeps the low half
+                _mm512_extractf32x8_ps::<1>(a.into()).simd_into(self), // lanes 8..16
+            )
+        }
+    }
+    #[inline(always)] // Reinterprets the same 512 bits as f64x8; a cast, not a numeric conversion.
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
+        unsafe { _mm512_castps_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Reinterprets the same 512 bits as i32x16; a cast, not a numeric conversion.
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_castps_si512(a.into()).simd_into(self) }
+    }
+    #[inline(always)] // Loads 16 floats as four 4-float groups and transposes them (4x4).
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        unsafe {
+            let v0 = _mm_loadu_ps(src.as_ptr() as *const _); // src[0..4], unaligned load
+            let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _); // src[4..8]
+            let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _); // src[8..12]
+            let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _); // src[12..16]
+            let tmp0 = _mm_unpacklo_ps(v0, v1); // [v0[0], v1[0], v0[1], v1[1]]
+            let tmp1 = _mm_unpackhi_ps(v0, v1); // [v0[2], v1[2], v0[3], v1[3]]
+            let tmp2 = _mm_unpacklo_ps(v2, v3); // [v2[0], v3[0], v2[1], v3[1]]
+            let tmp3 = _mm_unpackhi_ps(v2, v3); // [v2[2], v3[2], v2[3], v3[3]]
+            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); // element 0 of v0..v3
+            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); // element 1 of v0..v3
+            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); // element 2 of v0..v3
+            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); // element 3 of v0..v3
+            self.combine_f32x8(
+                self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
+                self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
+            )
+        }
+    }
+    /// Splits `a` into four 128-bit quarters, performs a 4x4 transpose of them
+    /// with 128-bit unpack operations, and writes the four transposed rows to
+    /// `dest` as consecutive groups of four floats.
+    #[inline(always)]
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        let (front, back) = self.split_f32x16(a);
+        let (q0, q1) = self.split_f32x8(front);
+        let (q2, q3) = self.split_f32x8(back);
+        let (q0, q1, q2, q3) = (q0.into(), q1.into(), q2.into(), q3.into());
+        unsafe {
+            // First pass: interleave 32-bit elements of quarter pairs.
+            let q01_lo = _mm_unpacklo_ps(q0, q1);
+            let q01_hi = _mm_unpackhi_ps(q0, q1);
+            let q23_lo = _mm_unpacklo_ps(q2, q3);
+            let q23_hi = _mm_unpackhi_ps(q2, q3);
+            // Second pass: interleave 64-bit pairs to finish the transpose.
+            let row0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(q01_lo), _mm_castps_pd(q23_lo)));
+            let row1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(q01_lo), _mm_castps_pd(q23_lo)));
+            let row2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(q01_hi), _mm_castps_pd(q23_hi)));
+            let row3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(q01_hi), _mm_castps_pd(q23_hi)));
+            let out = dest.as_mut_ptr();
+            _mm_storeu_ps(out, row0);
+            _mm_storeu_ps(out.add(4usize), row1);
+            _mm_storeu_ps(out.add(2 * 4usize), row2);
+            _mm_storeu_ps(out.add(3 * 4usize), row3);
+        }
+    }
+    /// Reinterprets the 512 bits of `a` as `u8x64` using a register cast
+    /// (`_mm512_castps_si512`); no value conversion takes place.
+    #[inline(always)]
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_castps_si512(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Reinterprets the 512 bits of `a` as `u32x16` using a register cast
+    /// (`_mm512_castps_si512`); no value conversion takes place.
+    #[inline(always)]
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let bits = unsafe { _mm512_castps_si512(a.into()) };
+        bits.simd_into(self)
+    }
+    /// Converts each lane to `u32` with the truncating conversion
+    /// `_mm512_cvttps_epu32`, with no extra range handling.
+    #[inline(always)]
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let truncated = unsafe { _mm512_cvttps_epu32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts each lane to `u32` with explicit range handling: the input is
+    /// first passed through `max(a, 0.0)`, then truncated, and every lane that
+    /// compares greater than `4294967040.0` is forced to `u32::MAX`.
+    #[inline(always)]
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let saturated = unsafe {
+            let non_negative = _mm512_max_ps(a.into(), _mm512_setzero_ps());
+            let truncated = _mm512_cvttps_epu32(non_negative);
+            // Predicate 17 is `_CMP_LT_OQ`: set where 4294967040.0 < lane.
+            let too_large =
+                _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), non_negative);
+            let all_ones = _mm512_set1_epi32(u32::MAX.cast_signed());
+            _mm512_mask_blend_epi32(too_large, truncated, all_ones)
+        };
+        saturated.simd_into(self)
+    }
+    /// Converts each lane to `i32` with the truncating conversion
+    /// `_mm512_cvttps_epi32`, with no extra range handling.
+    #[inline(always)]
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let truncated = unsafe { _mm512_cvttps_epi32(a.into()) };
+        truncated.simd_into(self)
+    }
+    /// Converts each lane to `i32` with explicit range handling: lanes that are
+    /// not less than `2147483648.0` become `i32::MAX`, and lanes that fail the
+    /// ordered self-comparison (NaN) become `0`.
+    #[inline(always)]
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let fixed = unsafe {
+            let input = a.into();
+            let truncated = _mm512_cvttps_epi32(input);
+            // Predicate 17 is `_CMP_LT_OQ`: set where lane < 2^31.
+            let below_limit = _mm512_cmp_ps_mask::<17i32>(input, _mm512_set1_ps(2147483648.0));
+            let clamped =
+                _mm512_mask_blend_epi32(below_limit, _mm512_set1_epi32(i32::MAX), truncated);
+            // Predicate 7 is `_CMP_ORD_Q`: set where the lane is not NaN.
+            let ordered = _mm512_cmp_ps_mask::<7i32>(input, input);
+            _mm512_mask_blend_epi32(ordered, _mm512_setzero_si512(), clamped)
+        };
+        fixed.simd_into(self)
+    }
+    /// Broadcasts `val` into all 64 lanes.
+    #[inline(always)]
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+        let lanes = unsafe { _mm512_set1_epi8(val) };
+        lanes.simd_into(self)
+    }
+    /// Builds an `i8x64` from an owned array by copying its bytes with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
+        let packed = unsafe { crate::support::checked_transmute_copy(&val) };
+        i8x64 {
+            val: packed,
+            simd: self,
+        }
+    }
+    /// Builds an `i8x64` from a borrowed array by copying its bytes with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
+        let packed = unsafe { crate::support::checked_transmute_copy(val) };
+        i8x64 {
+            val: packed,
+            simd: self,
+        }
+    }
+    /// Returns the lanes of `a` as a plain array by transmuting the inner
+    /// `__m512i` register value.
+    #[inline(always)]
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
+        let raw: __m512i = a.val.0;
+        unsafe { core::mem::transmute::<__m512i, [i8; 64usize]>(raw) }
+    }
+    /// Views the inner `__m512i` of `a` as a shared reference to a 64-element
+    /// array, without copying.
+    #[inline(always)]
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
+        let raw: &__m512i = &a.val.0;
+        unsafe { core::mem::transmute::<&__m512i, &[i8; 64usize]>(raw) }
+    }
+    /// Views the inner `__m512i` of `a` as a mutable reference to a 64-element
+    /// array, without copying.
+    #[inline(always)]
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
+        let raw: &mut __m512i = &mut a.val.0;
+        unsafe { core::mem::transmute::<&mut __m512i, &mut [i8; 64usize]>(raw) }
+    }
+    /// Copies the 64 lanes of `a` into `dest` with a raw, non-overlapping
+    /// memory copy from the inner register value.
+    #[inline(always)]
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+        let src = (&raw const a.val.0) as *const i8;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 64usize);
+        }
+    }
+    /// Rewraps the storage of a `u8x64` as an `i8x64` by transmuting the
+    /// `val` field; the bits are left untouched.
+    #[inline(always)]
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        i8x64 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Rewraps the storage of an `i8x64` as a `u8x64` by transmuting the
+    /// `val` field; the bits are left untouched.
+    #[inline(always)]
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x64 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a 64-byte window over `a` followed by `b`, starting `SHIFT` bytes
+    /// in. Returns `b` unchanged when `SHIFT >= 64`; otherwise each output byte
+    /// `i` is selected with index `i + SHIFT` by a two-source byte permute
+    /// (indices 0..=63 pick from `a`, 64..=127 from `b`).
+    #[inline(always)]
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_i8x64(a).val.0;
+        let second = self.cvt_to_bytes_i8x64(b).val.0;
+        let window = unsafe {
+            // Per-byte lane number plus SHIFT gives the source index.
+            let source_idx = _mm512_add_epi8(
+                _mm512_set_epi8(
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT) as i8),
+            );
+            _mm512_permutex2var_epi8(first, source_idx, second)
+        };
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(window),
+            simd: self,
+        })
+    }
+    /// Applies `slide_within_blocks_i8x32::<SHIFT>` to the matching halves of
+    /// `a` and `b` and recombines the two results.
+    #[inline(always)]
+    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+        self,
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
+        let (a_lower, a_upper) = self.split_i8x64(a);
+        let (b_lower, b_upper) = self.split_i8x64(b);
+        let lower = self.slide_within_blocks_i8x32::<SHIFT>(a_lower, b_lower);
+        let upper = self.slide_within_blocks_i8x32::<SHIFT>(a_upper, b_upper);
+        self.combine_i8x32(lower, upper)
+    }
+    /// Lane-wise 8-bit addition via `_mm512_add_epi8`.
+    #[inline(always)]
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let sum = unsafe { _mm512_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise 8-bit subtraction (`a - b`) via `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let difference = unsafe { _mm512_sub_epi8(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise 8-bit multiplication built from 16-bit multiplies: the even
+    /// bytes are multiplied in place and masked to their low byte, the odd
+    /// bytes are shifted down, multiplied, and shifted back up, and the two
+    /// partial results are OR-ed together.
+    #[inline(always)]
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let lhs: __m512i = a.into();
+        let rhs: __m512i = b.into();
+        let product = unsafe {
+            let even = _mm512_mullo_epi16(lhs, rhs);
+            let odd = _mm512_mullo_epi16(
+                _mm512_srli_epi16::<8>(lhs),
+                _mm512_srli_epi16::<8>(rhs),
+            );
+            let odd_in_place = _mm512_slli_epi16::<8>(odd);
+            let even_low_byte = _mm512_and_si512(even, _mm512_set1_epi16(0xFF));
+            _mm512_or_si512(odd_in_place, even_low_byte)
+        };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the full 512-bit registers.
+    #[inline(always)]
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let bits = unsafe { _mm512_and_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the full 512-bit registers.
+    #[inline(always)]
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let bits = unsafe { _mm512_or_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the full 512-bit registers.
+    #[inline(always)]
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let bits = unsafe { _mm512_xor_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar (`!0`).
+    #[inline(always)]
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        core::ops::BitXor::bitxor(a, !0)
+    }
+    /// Shifts every 8-bit lane of `a` left by `shift` bits. Bits shifted past
+    /// bit 7 are discarded, matching the scalar `<<` used by `shlv_i8x64`.
+    ///
+    /// There is no 8-bit shift instruction, so the bytes are widened to 16-bit
+    /// lanes, shifted there, and narrowed again. The narrowing pack saturates,
+    /// so each shifted lane is first masked down to its low byte; without that
+    /// mask an overflowing shift such as `64 << 1` would be clamped to `127`
+    /// instead of wrapping to `-128`.
+    #[inline(always)]
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            // Only the low byte of each widened lane survives, so the value of
+            // the extension byte is irrelevant; zero-extend.
+            let zero = _mm512_setzero_si512();
+            let lo_16 = _mm512_unpacklo_epi8(val, zero);
+            let hi_16 = _mm512_unpackhi_epi8(val, zero);
+            let low_byte = _mm512_set1_epi16(0xFF);
+            let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), low_byte);
+            let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), low_byte);
+            // Every lane is now in 0..=255, so the unsigned-saturating pack
+            // copies the low byte through unchanged.
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Per-lane variable left shift, computed one lane at a time with the
+    /// scalar `Shl` operator (`a[i]` shifted by `b[i]`).
+    #[inline(always)]
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let lanes = core::array::from_fn(|lane| core::ops::Shl::shl(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Arithmetic right shift of every 8-bit lane by `shift` bits. The bytes
+    /// are sign-extended to 16-bit lanes (the extension byte is all ones for
+    /// lanes below zero), shifted with the 16-bit arithmetic shift, and packed
+    /// back to bytes.
+    #[inline(always)]
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let val: __m512i = a.into();
+        let narrowed = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            // 0xFF in every byte where `0 > val`, 0x00 elsewhere.
+            let sign_bytes =
+                _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val));
+            let lower = _mm512_sra_epi16(_mm512_unpacklo_epi8(val, sign_bytes), count);
+            let upper = _mm512_sra_epi16(_mm512_unpackhi_epi8(val, sign_bytes), count);
+            _mm512_packs_epi16(lower, upper)
+        };
+        narrowed.simd_into(self)
+    }
+    /// Per-lane variable right shift, computed one lane at a time with the
+    /// scalar `Shr` operator (`a[i]` shifted by `b[i]`).
+    #[inline(always)]
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let lanes = core::array::from_fn(|lane| core::ops::Shr::shr(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as the bitmask produced by
+    /// `_mm512_cmpeq_epi8_mask`.
+    #[inline(always)]
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpeq_epi8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a < b`, returned as the bitmask produced by
+    /// `_mm512_cmplt_epi8_mask`.
+    #[inline(always)]
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmplt_epi8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a <= b`, returned as the bitmask produced by
+    /// `_mm512_cmple_epi8_mask`.
+    #[inline(always)]
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmple_epi8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a >= b`, returned as the bitmask produced by
+    /// `_mm512_cmpge_epi8_mask`.
+    #[inline(always)]
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpge_epi8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise signed `a > b`, returned as the bitmask produced by
+    /// `_mm512_cmpgt_epi8_mask`.
+    #[inline(always)]
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpgt_epi8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low 32 lanes of `a` and `b` as `a0, b0, a1, b1, ...`
+    /// using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let zipped = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
+                    85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
+                    75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
+                    64, 0,
+                ),
+                b.into(),
+            )
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high 32 lanes of `a` and `b` as `a32, b32, a33, b33, ...`
+    /// using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let zipped = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
+                    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
+                    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
+                    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                ),
+                b.into(),
+            )
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes
+    /// of `b` using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let evens = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
+                    54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
+                    14, 12, 10, 8, 6, 4, 2, 0,
+                ),
+                b.into(),
+            )
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes
+    /// of `b` using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let odds = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
+                    55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
+                    15, 13, 11, 9, 7, 5, 3, 1,
+                ),
+                b.into(),
+            )
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both interleavings of `a` and `b`: first the low halves
+    /// (`a0, b0, a1, b1, ...`), then the high halves (`a32, b32, ...`). Each is
+    /// a two-source byte permute with the same index tables as
+    /// `zip_low_i8x64` / `zip_high_i8x64`.
+    #[inline(always)]
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let lhs: __m512i = a.into();
+        let rhs: __m512i = b.into();
+        let low = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
+                    85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
+                    75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
+                    64, 0,
+                ),
+                rhs,
+            )
+        };
+        let high = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
+                    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
+                    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
+                    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                ),
+                rhs,
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Returns the even-indexed lanes of `a` then `b` as the first vector and
+    /// the odd-indexed lanes of `a` then `b` as the second. Each is a
+    /// two-source byte permute with the same index tables as
+    /// `unzip_low_i8x64` / `unzip_high_i8x64`.
+    #[inline(always)]
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let lhs: __m512i = a.into();
+        let rhs: __m512i = b.into();
+        let evens = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
+                    54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
+                    14, 12, 10, 8, 6, 4, 2, 0,
+                ),
+                rhs,
+            )
+        };
+        let odds = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
+                    55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
+                    15, 13, 11, 9, 7, 5, 3, 1,
+                ),
+                rhs,
+            )
+        };
+        (evens.simd_into(self), odds.simd_into(self))
+    }
+    /// Lane-wise select: lanes whose bit is set in mask `a` are taken from `b`,
+    /// all other lanes from `c`.
+    #[inline(always)]
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
+        // The blend takes its *second* vector where the mask bit is set, hence `c` first.
+        let blended = unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()) };
+        blended.simd_into(self)
+    }
+    /// Lane-wise signed minimum via `_mm512_min_epi8`.
+    #[inline(always)]
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let smaller = unsafe { _mm512_min_epi8(a.into(), b.into()) };
+        smaller.simd_into(self)
+    }
+    /// Lane-wise signed maximum via `_mm512_max_epi8`.
+    #[inline(always)]
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let larger = unsafe { _mm512_max_epi8(a.into(), b.into()) };
+        larger.simd_into(self)
+    }
+    /// Splits `a` into its lower 256 bits (a cast of the register) and its
+    /// upper 256 bits (extracted at index 1), returned in that order.
+    #[inline(always)]
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let raw: __m512i = a.into();
+        let lower = unsafe { _mm512_castsi512_si256(raw) };
+        let upper = unsafe { _mm512_extracti64x4_epi64::<1>(raw) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Lane-wise negation, computed as `0 - a` with `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let negated = unsafe { _mm512_sub_epi8(_mm512_setzero_si512(), a.into()) };
+        negated.simd_into(self)
+    }
+    /// Reinterprets `a` as `u8x64` by round-tripping through the raw `__m512i`
+    /// register value.
+    #[inline(always)]
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Reinterprets `a` as `u32x16` by round-tripping through the raw `__m512i`
+    /// register value.
+    #[inline(always)]
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
+        let raw: __m512i = a.into();
+        raw.simd_into(self)
+    }
+    /// Broadcasts `val` into all 64 lanes; the intrinsic takes an `i8`, so the
+    /// value is bit-cast with `cast_signed` first.
+    #[inline(always)]
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
+        let lanes = unsafe { _mm512_set1_epi8(val.cast_signed()) };
+        lanes.simd_into(self)
+    }
+    /// Builds a `u8x64` from an owned array by copying its bytes with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
+        let packed = unsafe { crate::support::checked_transmute_copy(&val) };
+        u8x64 {
+            val: packed,
+            simd: self,
+        }
+    }
+    /// Builds a `u8x64` from a borrowed array by copying its bytes with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
+        let packed = unsafe { crate::support::checked_transmute_copy(val) };
+        u8x64 {
+            val: packed,
+            simd: self,
+        }
+    }
+    /// Returns the lanes of `a` as a plain array by transmuting the inner
+    /// `__m512i` register value.
+    #[inline(always)]
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
+        let raw: __m512i = a.val.0;
+        unsafe { core::mem::transmute::<__m512i, [u8; 64usize]>(raw) }
+    }
+    /// Views the inner `__m512i` of `a` as a shared reference to a 64-element
+    /// array, without copying.
+    #[inline(always)]
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
+        let raw: &__m512i = &a.val.0;
+        unsafe { core::mem::transmute::<&__m512i, &[u8; 64usize]>(raw) }
+    }
+    /// Views the inner `__m512i` of `a` as a mutable reference to a 64-element
+    /// array, without copying.
+    #[inline(always)]
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
+        let raw: &mut __m512i = &mut a.val.0;
+        unsafe { core::mem::transmute::<&mut __m512i, &mut [u8; 64usize]>(raw) }
+    }
+    /// Copies the 64 lanes of `a` into `dest` with a raw, non-overlapping
+    /// memory copy from the inner register value.
+    #[inline(always)]
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+        let src = (&raw const a.val.0) as *const u8;
+        let dst = dest.as_mut_ptr();
+        unsafe {
+            core::ptr::copy_nonoverlapping(src, dst, 64usize);
+        }
+    }
+    /// Byte conversion for a type that is already bytes: rebuilds a `u8x64`
+    /// from the transmuted `val` field of `a`.
+    #[inline(always)]
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x64 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Byte conversion for a type that is already bytes: rebuilds a `u8x64`
+    /// from the transmuted `val` field of `a`.
+    #[inline(always)]
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let storage = unsafe { core::mem::transmute(a.val) };
+        u8x64 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Slides a 64-byte window over `a` followed by `b`, starting `SHIFT` bytes
+    /// in. Returns `b` unchanged when `SHIFT >= 64`; otherwise each output byte
+    /// `i` is selected with index `i + SHIFT` by a two-source byte permute
+    /// (indices 0..=63 pick from `a`, 64..=127 from `b`).
+    #[inline(always)]
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let first = self.cvt_to_bytes_u8x64(a).val.0;
+        let second = self.cvt_to_bytes_u8x64(b).val.0;
+        let window = unsafe {
+            // Per-byte lane number plus SHIFT gives the source index.
+            let source_idx = _mm512_add_epi8(
+                _mm512_set_epi8(
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT) as i8),
+            );
+            _mm512_permutex2var_epi8(first, source_idx, second)
+        };
+        self.cvt_from_bytes_u8x64(u8x64 {
+            val: crate::support::Aligned512(window),
+            simd: self,
+        })
+    }
+    /// Applies `slide_within_blocks_u8x32::<SHIFT>` to the matching halves of
+    /// `a` and `b` and recombines the two results.
+    #[inline(always)]
+    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+        self,
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
+        let (a_lower, a_upper) = self.split_u8x64(a);
+        let (b_lower, b_upper) = self.split_u8x64(b);
+        let lower = self.slide_within_blocks_u8x32::<SHIFT>(a_lower, b_lower);
+        let upper = self.slide_within_blocks_u8x32::<SHIFT>(a_upper, b_upper);
+        self.combine_u8x32(lower, upper)
+    }
+    /// Lane-wise 8-bit addition via `_mm512_add_epi8`.
+    #[inline(always)]
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let sum = unsafe { _mm512_add_epi8(a.into(), b.into()) };
+        sum.simd_into(self)
+    }
+    /// Lane-wise 8-bit subtraction (`a - b`) via `_mm512_sub_epi8`.
+    #[inline(always)]
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let difference = unsafe { _mm512_sub_epi8(a.into(), b.into()) };
+        difference.simd_into(self)
+    }
+    /// Lane-wise 8-bit multiplication built from 16-bit multiplies: the even
+    /// bytes are multiplied in place and masked to their low byte, the odd
+    /// bytes are shifted down, multiplied, and shifted back up, and the two
+    /// partial results are OR-ed together.
+    #[inline(always)]
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let lhs: __m512i = a.into();
+        let rhs: __m512i = b.into();
+        let product = unsafe {
+            let even = _mm512_mullo_epi16(lhs, rhs);
+            let odd = _mm512_mullo_epi16(
+                _mm512_srli_epi16::<8>(lhs),
+                _mm512_srli_epi16::<8>(rhs),
+            );
+            let odd_in_place = _mm512_slli_epi16::<8>(odd);
+            let even_low_byte = _mm512_and_si512(even, _mm512_set1_epi16(0xFF));
+            _mm512_or_si512(odd_in_place, even_low_byte)
+        };
+        product.simd_into(self)
+    }
+    /// Bitwise AND of the full 512-bit registers.
+    #[inline(always)]
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_and_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise OR of the full 512-bit registers.
+    #[inline(always)]
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_or_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise XOR of the full 512-bit registers.
+    #[inline(always)]
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let bits = unsafe { _mm512_xor_si512(a.into(), b.into()) };
+        bits.simd_into(self)
+    }
+    /// Bitwise NOT, expressed as XOR with an all-ones scalar (`!0`).
+    #[inline(always)]
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        core::ops::BitXor::bitxor(a, !0)
+    }
+    /// Shifts every 8-bit lane of `a` left by `shift` bits. Bits shifted past
+    /// bit 7 are discarded, matching the scalar `<<` used by `shlv_u8x64`.
+    ///
+    /// There is no 8-bit shift instruction, so the bytes are zero-extended to
+    /// 16-bit lanes, shifted there, and narrowed again. The narrowing pack
+    /// saturates, so each shifted lane is first masked down to its low byte;
+    /// without that mask an overflowing shift such as `0x80 << 1` would be
+    /// clamped to `0xFF` instead of producing `0`.
+    #[inline(always)]
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        unsafe {
+            let val = a.into();
+            let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+            let zero = _mm512_setzero_si512();
+            let lo_16 = _mm512_unpacklo_epi8(val, zero);
+            let hi_16 = _mm512_unpackhi_epi8(val, zero);
+            let low_byte = _mm512_set1_epi16(0xFF);
+            let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), low_byte);
+            let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), low_byte);
+            // Every lane is now in 0..=255, so the saturating pack is lossless.
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
+    }
+    /// Per-lane variable left shift, computed one lane at a time with the
+    /// scalar `Shl` operator (`a[i]` shifted by `b[i]`).
+    #[inline(always)]
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let lanes = core::array::from_fn(|lane| core::ops::Shl::shl(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Logical right shift of every 8-bit lane by `shift` bits. The bytes are
+    /// zero-extended to 16-bit lanes, shifted with the 16-bit logical shift,
+    /// and packed back to bytes.
+    #[inline(always)]
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let val: __m512i = a.into();
+        let narrowed = unsafe {
+            let count = _mm_cvtsi32_si128(shift.cast_signed());
+            let lower = _mm512_srl_epi16(_mm512_unpacklo_epi8(val, _mm512_setzero_si512()), count);
+            let upper = _mm512_srl_epi16(_mm512_unpackhi_epi8(val, _mm512_setzero_si512()), count);
+            _mm512_packus_epi16(lower, upper)
+        };
+        narrowed.simd_into(self)
+    }
+    /// Per-lane variable right shift, computed one lane at a time with the
+    /// scalar `Shr` operator (`a[i]` shifted by `b[i]`).
+    #[inline(always)]
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let lanes = core::array::from_fn(|lane| core::ops::Shr::shr(a[lane], b[lane]));
+        lanes.simd_into(self)
+    }
+    /// Lane-wise `a == b`, returned as the bitmask produced by
+    /// `_mm512_cmpeq_epu8_mask`.
+    #[inline(always)]
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpeq_epu8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a < b`, returned as the bitmask produced by
+    /// `_mm512_cmplt_epu8_mask`.
+    #[inline(always)]
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmplt_epu8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a <= b`, returned as the bitmask produced by
+    /// `_mm512_cmple_epu8_mask`.
+    #[inline(always)]
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmple_epu8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a >= b`, returned as the bitmask produced by
+    /// `_mm512_cmpge_epu8_mask`.
+    #[inline(always)]
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpge_epu8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Lane-wise unsigned `a > b`, returned as the bitmask produced by
+    /// `_mm512_cmpgt_epu8_mask`.
+    #[inline(always)]
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let bits = unsafe { _mm512_cmpgt_epu8_mask(a.into(), b.into()) };
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    /// Interleaves the low 32 lanes of `a` and `b` as `a0, b0, a1, b1, ...`
+    /// using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let zipped = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
+                    85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
+                    75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
+                    64, 0,
+                ),
+                b.into(),
+            )
+        };
+        zipped.simd_into(self)
+    }
+    /// Interleaves the high 32 lanes of `a` and `b` as `a32, b32, a33, b33, ...`
+    /// using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let zipped = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
+                    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
+                    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
+                    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                ),
+                b.into(),
+            )
+        };
+        zipped.simd_into(self)
+    }
+    /// Gathers the even-indexed lanes of `a` followed by the even-indexed lanes
+    /// of `b` using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let evens = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
+                    54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
+                    14, 12, 10, 8, 6, 4, 2, 0,
+                ),
+                b.into(),
+            )
+        };
+        evens.simd_into(self)
+    }
+    /// Gathers the odd-indexed lanes of `a` followed by the odd-indexed lanes
+    /// of `b` using a two-source byte permute (indices 0..=63 pick from `a`,
+    /// 64..=127 from `b`).
+    #[inline(always)]
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let odds = unsafe {
+            _mm512_permutex2var_epi8(
+                a.into(),
+                // `_mm512_set_epi8` lists the highest lane first.
+                _mm512_set_epi8(
+                    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
+                    55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
+                    15, 13, 11, 9, 7, 5, 3, 1,
+                ),
+                b.into(),
+            )
+        };
+        odds.simd_into(self)
+    }
+    /// Returns both interleavings of `a` and `b`: first the low halves
+    /// (`a0, b0, a1, b1, ...`), then the high halves (`a32, b32, ...`). Each is
+    /// a two-source byte permute with the same index tables as
+    /// `zip_low_u8x64` / `zip_high_u8x64`.
+    #[inline(always)]
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let lhs: __m512i = a.into();
+        let rhs: __m512i = b.into();
+        let low = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
+                    85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
+                    75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
+                    64, 0,
+                ),
+                rhs,
+            )
+        };
+        let high = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
+                    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
+                    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
+                    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                ),
+                rhs,
+            )
+        };
+        (low.simd_into(self), high.simd_into(self))
+    }
+    /// Returns the even-indexed lanes of `a` then `b` as the first vector and
+    /// the odd-indexed lanes of `a` then `b` as the second. Each is a
+    /// two-source byte permute with the same index tables as
+    /// `unzip_low_u8x64` / `unzip_high_u8x64`.
+    #[inline(always)]
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let lhs: __m512i = a.into();
+        let rhs: __m512i = b.into();
+        let evens = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
+                    54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
+                    14, 12, 10, 8, 6, 4, 2, 0,
+                ),
+                rhs,
+            )
+        };
+        let odds = unsafe {
+            _mm512_permutex2var_epi8(
+                lhs,
+                _mm512_set_epi8(
+                    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
+                    55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
+                    15, 13, 11, 9, 7, 5, 3, 1,
+                ),
+                rhs,
+            )
+        };
+        (evens.simd_into(self), odds.simd_into(self))
+    }
+    #[inline(always)]
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> { // per lane: b where the mask bit in a is set, otherwise c
+        unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) } // the blend yields its last operand for set bits, hence c is passed before b
+    }
+    #[inline(always)]
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // lane-wise minimum of unsigned bytes
+        unsafe { _mm512_min_epu8(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // lane-wise maximum of unsigned bytes
+        unsafe { _mm512_max_epu8(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) { // returns (low 256 bits, high 256 bits)
+        unsafe {
+            (
+                _mm512_castsi512_si256(a.into()).simd_into(self), // cast keeps the low 256 bits
+                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), // index 1 selects the upper 256 bits
+            )
+        }
+    }
+    #[inline(always)]
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> { // 4-way de-interleave: result[16*c + m] = src[4*m + c], assuming combine_* puts its first argument in the low half (TODO confirm)
+        unsafe {
+            let v0 = _mm_loadu_si128(src.as_ptr() as *const _); // four unaligned 16-byte loads cover the whole 64-byte array
+            let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
+            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
+            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
+            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); // gathers every 4th byte so each 32-bit group holds one channel
+            let v0 = _mm_shuffle_epi8(v0, mask);
+            let v1 = _mm_shuffle_epi8(v1, mask);
+            let v2 = _mm_shuffle_epi8(v2, mask);
+            let v3 = _mm_shuffle_epi8(v3, mask);
+            let tmp0 = _mm_unpacklo_epi32(v0, v1); // 4x4 transpose of the 32-bit groups: 32-bit unpacks first ...
+            let tmp1 = _mm_unpackhi_epi32(v0, v1);
+            let tmp2 = _mm_unpacklo_epi32(v2, v3);
+            let tmp3 = _mm_unpackhi_epi32(v2, v3);
+            let out0 = _mm_unpacklo_epi64(tmp0, tmp2); // ... then 64-bit unpacks; out<c> holds channel c
+            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+            self.combine_u8x32(
+                self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
+                self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
+            )
+        }
+    }
+    #[inline(always)]
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () { // 4-way interleave on store: dest[4*m + c] = a[16*c + m]
+        let (v01, v23) = self.split_u8x64(a); // peel the vector into four 128-bit quarters v0..v3
+        let (v0, v1) = self.split_u8x32(v01);
+        let (v2, v3) = self.split_u8x32(v23);
+        let v0 = v0.into();
+        let v1 = v1.into();
+        let v2 = v2.into();
+        let v3 = v3.into();
+        unsafe {
+            let tmp0 = _mm_unpacklo_epi32(v0, v1); // 4x4 transpose of 32-bit groups across the four quarters
+            let tmp1 = _mm_unpackhi_epi32(v0, v1);
+            let tmp2 = _mm_unpacklo_epi32(v2, v3);
+            let tmp3 = _mm_unpackhi_epi32(v2, v3);
+            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); // byte-level interleave inside each 16-byte chunk
+            let out0 = _mm_shuffle_epi8(out0, mask);
+            let out1 = _mm_shuffle_epi8(out1, mask);
+            let out2 = _mm_shuffle_epi8(out2, mask);
+            let out3 = _mm_shuffle_epi8(out3, mask);
+            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); // four unaligned 16-byte stores fill all 64 bytes of dest
+            _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
+            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
+            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
+        }
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> { // converts to the raw __m512i and re-wraps it as u32x16
+        __m512i::from(a).simd_into(self)
+    }
+    #[inline(always)]
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> { // all 64 mask bits set when val is true, all clear otherwise
+        mask8x64 {
+            val: if val { u64::MAX } else { 0 },
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> { // packs a lane array into a bitmask: bit i is set iff val[i] != 0
+        let val = &val;
+        let mut bits = 0u64;
+        let mut i = 0usize;
+        while i < 64usize {
+            if val[i] != 0 { // any non-zero lane counts as true, not only -1
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        mask8x64 {
+            val: bits,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] { // expands the bitmask into lanes: !0 (i.e. -1) where bit i is set, 0 elsewhere
+        let bits = u64::from((a).val);
+        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+    }
+    #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> { // uses bits directly as the stored mask value
+        mask8x64 {
+            val: bits & u64::MAX, // the AND with u64::MAX leaves all 64 bits untouched at this width
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 { // returns the stored mask value as a u64
+        u64::from((a).val) & u64::MAX // the AND with u64::MAX changes nothing for a 64-bit value
+    }
+    #[inline(always)]
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> { // bitwise AND of the two 64-bit masks
+        mask8x64 {
+            val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> { // bitwise OR of the two 64-bit masks
+        mask8x64 {
+            val: (u64::from((a).val) | u64::from((b).val)) & u64::MAX,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> { // bitwise XOR of the two 64-bit masks
+        mask8x64 {
+            val: (u64::from((a).val) ^ u64::from((b).val)) & u64::MAX,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> { // flips every one of the 64 mask bits
+        mask8x64 {
+            val: (!u64::from((a).val)) & u64::MAX,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn select_mask8x64( // per bit: takes b where a is set and c where a is clear
+        self,
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        mask8x64 {
+            val: ((u64::from((a).val) & u64::from((b).val)) // bits of b kept where a is 1
+                | ((!u64::from((a).val)) & u64::from((c).val))) // bits of c kept where a is 0
+                & u64::MAX,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> { // bit i is set where a and b hold the same bit (XNOR)
+        mask8x64 {
+            val: !u64::from(a.val ^ b.val) & u64::MAX, // unary ! binds tighter than &, so this is (!(a ^ b)) & MAX
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool { // true when at least one mask bit is set
+        let bits = u64::from((a).val) & u64::MAX;
+        bits != 0
+    }
+    #[inline(always)]
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool { // true when all 64 mask bits are set
+        let bits = u64::from((a).val) & u64::MAX;
+        bits == u64::MAX
+    }
+    #[inline(always)]
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool { // true when at least one mask bit is clear
+        let bits = u64::from((a).val) & u64::MAX;
+        bits != u64::MAX
+    }
+    #[inline(always)]
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool { // true when no mask bit is set
+        let bits = u64::from((a).val) & u64::MAX;
+        bits == 0
+    }
+    #[inline(always)]
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) { // returns (bits 0..32, bits 32..64) as two 32-lane masks
+        let bits = u64::from(a.val);
+        (
+            mask8x32 {
+                val: (bits & 4294967295u64) as _, // 4294967295 == u32::MAX: keep the low 32 bits
+                simd: self,
+            },
+            mask8x32 {
+                val: ((bits >> 32usize) & 4294967295u64) as _, // move the high 32 bits down before narrowing
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> { // broadcasts val into all 32 lanes
+        unsafe { _mm512_set1_epi16(val).simd_into(self) }
+    }
+    #[inline(always)]
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> { // builds the vector from an owned array of 32 lanes
+        i16x32 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // presumably a size-checked bit copy into the storage type; helper is defined elsewhere
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> { // builds the vector from a borrowed array of 32 lanes
+        i16x32 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // presumably a size-checked bit copy into the storage type; helper is defined elsewhere
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] { // returns the lanes by value; the 64-byte register is transmuted to [i16; 32]
+        unsafe { core::mem::transmute::<__m512i, [i16; 32usize]>(a.val.0) }
+    }
+    #[inline(always)]
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] { // borrows the vector's storage as an array of 32 lanes, without copying
+        unsafe { core::mem::transmute::<&__m512i, &[i16; 32usize]>(&a.val.0) }
+    }
+    #[inline(always)]
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] { // mutably borrows the vector's storage as an array of 32 lanes
+        unsafe { core::mem::transmute::<&mut __m512i, &mut [i16; 32usize]>(&mut a.val.0) }
+    }
+    #[inline(always)]
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () { // writes all 32 lanes of a into dest
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const i16, // source: the register viewed as i16 elements
+                dest.as_mut_ptr(),
+                32usize, // element count, i.e. the full 64 bytes
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> { // re-labels a byte vector as i16x32; the storage is transmuted, not converted
+        unsafe {
+            i16x32 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> { // re-labels an i16x32 as a byte vector; the storage is transmuted, not converted
+        unsafe {
+            u8x64 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // result lane i is lane i + SHIFT of the 64-lane sequence a ++ b
+        unsafe {
+            if SHIFT >= 32usize { // a full-width (or larger) shift leaves only b
+                return b;
+            }
+            let idx = _mm512_add_epi8( // byte index table: identity 0..=63 plus the shift expressed in bytes
+                _mm512_set_epi8(
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT * 2usize) as i8), // 2 bytes per lane; SHIFT < 32 here, so the value is at most 62 and fits in i8
+            );
+            let result = _mm512_permutex2var_epi8( // indices 64 and above read from the second table (b)
+                self.cvt_to_bytes_i16x32(a).val.0,
+                idx,
+                self.cvt_to_bytes_i16x32(b).val.0,
+            );
+            self.cvt_from_bytes_i16x32(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i16x32<const SHIFT: usize>( // applies the 16-lane slide_within_blocks to each 256-bit half independently
+        self,
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16( // halves are paired low-with-low and high-with-high, then rejoined
+            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // lane-wise 16-bit addition (wraps on overflow)
+        unsafe { _mm512_add_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // lane-wise 16-bit subtraction a - b (wraps on overflow)
+        unsafe { _mm512_sub_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // lane-wise multiply keeping the low 16 bits of each product
+        unsafe { _mm512_mullo_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // bitwise AND over the full 512 bits
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // bitwise OR over the full 512 bits
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // bitwise XOR over the full 512 bits
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> { // bitwise NOT, expressed as XOR with an all-ones scalar
+        a ^ !0 // relies on the vector type's XOR-with-scalar operator, which is defined elsewhere
+    }
+    #[inline(always)]
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> { // shifts every lane left by the same runtime count
+        unsafe {
+            _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low bits of an XMM register
+        }
+    }
+    #[inline(always)]
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // per-lane variable left shift, computed lane by lane with the scalar << operator
+        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) // NOTE(review): scalar shifts overflow for counts outside 0..16 - confirm callers keep b in range
+    }
+    #[inline(always)]
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> { // arithmetic (sign-extending) right shift of every lane by the same runtime count
+        unsafe {
+            _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low bits of an XMM register
+        }
+    }
+    #[inline(always)]
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // per-lane variable right shift, computed lane by lane with the scalar >> operator
+        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) // NOTE(review): scalar shifts overflow for counts outside 0..16 - confirm callers keep b in range
+    }
+    #[inline(always)]
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] == b[i]
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmpeq_epi16_mask(a.into(), b.into()), // the compare produces the 32-bit mask directly
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] < b[i] (signed compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmplt_epi16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] <= b[i] (signed compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmple_epi16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] >= b[i] (signed compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmpge_epi16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] > b[i] (signed compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmpgt_epi16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // interleaves lanes 0..16 of a and b: a0, b0, a1, b1, ..., a15, b15
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37,
+                    5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // interleaves lanes 16..32 of a and b: a16, b16, ..., a31, b31
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
+                    53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // gathers the even-indexed lanes of the 64-lane sequence a ++ b
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
+                    22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // gathers the odd-indexed lanes of the 64-lane sequence a ++ b
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
+                    23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) { // returns (a0, b0, ..., a15, b15) and (a16, b16, ..., a31, b31)
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // low halves interleaved; indices 32..=63 pick from b
+                        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
+                        37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // high halves interleaved
+                        63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
+                        22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) { // returns (even-indexed lanes, odd-indexed lanes) of the 64-lane sequence a ++ b
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // even positions; indices 32..=63 pick from b
+                        62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+                        24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // odd positions
+                        63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
+                        25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> { // per lane: b where the mask bit in a is set, otherwise c
+        unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } // the blend yields its last operand for set bits, hence c is passed before b
+    }
+    #[inline(always)]
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // lane-wise signed minimum
+        unsafe { _mm512_min_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> { // lane-wise signed maximum
+        unsafe { _mm512_max_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) { // returns (low 256 bits, high 256 bits)
+        unsafe {
+            (
+                _mm512_castsi512_si256(a.into()).simd_into(self), // cast keeps the low 256 bits
+                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), // index 1 selects the upper 256 bits
+            )
+        }
+    }
+    #[inline(always)]
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> { // lane-wise negation computed as 0 - a (wrapping, so i16::MIN stays i16::MIN)
+        unsafe { _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> { // converts to the raw __m512i and re-wraps it as u8x64
+        __m512i::from(a).simd_into(self)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> { // converts to the raw __m512i and re-wraps it as u32x16
+        __m512i::from(a).simd_into(self)
+    }
+    #[inline(always)]
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> { // broadcasts val into all 32 lanes
+        unsafe { _mm512_set1_epi16(val.cast_signed()).simd_into(self) } // the intrinsic takes i16, so the bits are passed through a same-width signed cast
+    }
+    #[inline(always)]
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> { // builds the vector from an owned array of 32 lanes
+        u16x32 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // presumably a size-checked bit copy into the storage type; helper is defined elsewhere
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> { // builds the vector from a borrowed array of 32 lanes
+        u16x32 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // presumably a size-checked bit copy into the storage type; helper is defined elsewhere
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] { // returns the lanes by value; the 64-byte register is transmuted to [u16; 32]
+        unsafe { core::mem::transmute::<__m512i, [u16; 32usize]>(a.val.0) }
+    }
+    #[inline(always)]
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] { // borrows the vector's storage as an array of 32 lanes, without copying
+        unsafe { core::mem::transmute::<&__m512i, &[u16; 32usize]>(&a.val.0) }
+    }
+    #[inline(always)]
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] { // mutably borrows the vector's storage as an array of 32 lanes
+        unsafe { core::mem::transmute::<&mut __m512i, &mut [u16; 32usize]>(&mut a.val.0) }
+    }
+    #[inline(always)]
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () { // writes all 32 lanes of a into dest
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const u16, // source: the register viewed as u16 elements
+                dest.as_mut_ptr(),
+                32usize, // element count, i.e. the full 64 bytes
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> { // re-labels a byte vector as u16x32; the storage is transmuted, not converted
+        unsafe {
+            u16x32 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> { // re-labels a u16x32 as a byte vector; the storage is transmuted, not converted
+        unsafe {
+            u8x64 {
+                val: core::mem::transmute(a.val),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // result lane i is lane i + SHIFT of the 64-lane sequence a ++ b
+        unsafe {
+            if SHIFT >= 32usize { // a full-width (or larger) shift leaves only b
+                return b;
+            }
+            let idx = _mm512_add_epi8( // byte index table: identity 0..=63 plus the shift expressed in bytes
+                _mm512_set_epi8(
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT * 2usize) as i8), // 2 bytes per lane; SHIFT < 32 here, so the value is at most 62 and fits in i8
+            );
+            let result = _mm512_permutex2var_epi8( // indices 64 and above read from the second table (b)
+                self.cvt_to_bytes_u16x32(a).val.0,
+                idx,
+                self.cvt_to_bytes_u16x32(b).val.0,
+            );
+            self.cvt_from_bytes_u16x32(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u16x32<const SHIFT: usize>( // applies the 16-lane slide_within_blocks to each 256-bit half independently
+        self,
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16( // halves are paired low-with-low and high-with-high, then rejoined
+            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // lane-wise 16-bit addition (wraps on overflow)
+        unsafe { _mm512_add_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // lane-wise 16-bit subtraction a - b (wraps on underflow)
+        unsafe { _mm512_sub_epi16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // lane-wise multiply keeping the low 16 bits of each product
+        unsafe { _mm512_mullo_epi16(a.into(), b.into()).simd_into(self) } // the low 16 bits are the same for signed and unsigned operands
+    }
+    #[inline(always)]
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // bitwise AND over the full 512 bits
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // bitwise OR over the full 512 bits
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // bitwise XOR over the full 512 bits
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> { // bitwise NOT, expressed as XOR with an all-ones scalar
+        a ^ !0 // relies on the vector type's XOR-with-scalar operator, which is defined elsewhere
+    }
+    #[inline(always)]
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> { // shifts every lane left by the same runtime count
+        unsafe {
+            _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // the count is passed in the low bits of an XMM register
+        }
+    }
+    #[inline(always)]
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // per-lane variable left shift, computed lane by lane with the scalar << operator
+        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self) // NOTE(review): scalar shifts overflow for counts of 16 or more - confirm callers keep b in range
+    }
+    #[inline(always)]
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> { // logical (zero-filling) right shift of every lane by the same runtime count
+        unsafe {
+            _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) // srl rather than sra: unsigned lanes must not be sign-extended
+        }
+    }
+    #[inline(always)]
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // per-lane variable right shift, computed lane by lane with the scalar >> operator
+        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self) // NOTE(review): scalar shifts overflow for counts of 16 or more - confirm callers keep b in range
+    }
+    #[inline(always)]
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] == b[i]
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmpeq_epu16_mask(a.into(), b.into()), // the compare produces the 32-bit mask directly
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] < b[i] (unsigned compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmplt_epu16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] <= b[i] (unsigned compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmple_epu16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] >= b[i] (unsigned compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmpge_epu16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> { // mask bit i is set where a[i] > b[i] (unsigned compare)
+        unsafe {
+            mask16x32 {
+                val: _mm512_cmpgt_epu16_mask(a.into(), b.into()),
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // interleaves lanes 0..16 of a and b: a0, b0, a1, b1, ..., a15, b15
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37,
+                    5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // interleaves lanes 16..32 of a and b: a16, b16, ..., a31, b31
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
+                    53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // gathers the even-indexed lanes of the 64-lane sequence a ++ b
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
+                    22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // gathers the odd-indexed lanes of the 64-lane sequence a ++ b
+        unsafe {
+            _mm512_permutex2var_epi16(
+                a.into(),
+                _mm512_set_epi16( // arguments run from lane 31 down to lane 0; indices 0..=31 pick from a, 32..=63 from b
+                    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
+                    23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                ),
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) { // returns (a0, b0, ..., a15, b15) and (a16, b16, ..., a31, b31)
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // low halves interleaved; indices 32..=63 pick from b
+                        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
+                        37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // high halves interleaved
+                        63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
+                        22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) { // returns (even-indexed lanes, odd-indexed lanes) of the 64-lane sequence a ++ b
+        unsafe {
+            let a = a.into();
+            let b = b.into();
+            (
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // even positions; indices 32..=63 pick from b
+                        62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+                        24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+                _mm512_permutex2var_epi16(
+                    a,
+                    _mm512_set_epi16( // odd positions
+                        63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
+                        25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                    ),
+                    b,
+                )
+                .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> { // per lane: b where the mask bit in a is set, otherwise c
+        unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) } // the blend yields its last operand for set bits, hence c is passed before b
+    }
+    #[inline(always)]
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // lane-wise unsigned minimum
+        unsafe { _mm512_min_epu16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> { // lane-wise unsigned maximum
+        unsafe { _mm512_max_epu16(a.into(), b.into()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) { // returns (low 256 bits, high 256 bits)
+        unsafe {
+            (
+                _mm512_castsi512_si256(a.into()).simd_into(self), // cast keeps the low 256 bits
+                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self), // index 1 selects the upper 256 bits
+            )
+        }
+    }
+    /// Loads 32 `u16` values laid out as four interleaved channels and groups them by channel:
+    /// the `k`-th lane of the `c`-th 128-bit piece passed to `combine_*` is `src[4 * k + c]`.
+    #[inline(always)]
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+        unsafe {
+            let base = src.as_ptr();
+            // Within each 128-bit row, place the two values of every channel side by side.
+            let pair_up = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+            let r0 = _mm_shuffle_epi8(_mm_loadu_si128(base as *const _), pair_up);
+            let r1 = _mm_shuffle_epi8(_mm_loadu_si128(base.add(8usize) as *const _), pair_up);
+            let r2 = _mm_shuffle_epi8(_mm_loadu_si128(base.add(2 * 8usize) as *const _), pair_up);
+            let r3 = _mm_shuffle_epi8(_mm_loadu_si128(base.add(3 * 8usize) as *const _), pair_up);
+            // 4x4 transpose of the 32-bit pairs: rows r0..r3 become channel vectors c0..c3.
+            let lo01 = _mm_unpacklo_epi32(r0, r1);
+            let hi01 = _mm_unpackhi_epi32(r0, r1);
+            let lo23 = _mm_unpacklo_epi32(r2, r3);
+            let hi23 = _mm_unpackhi_epi32(r2, r3);
+            let c0 = _mm_unpacklo_epi64(lo01, lo23);
+            let c1 = _mm_unpackhi_epi64(lo01, lo23);
+            let c2 = _mm_unpacklo_epi64(hi01, hi23);
+            let c3 = _mm_unpackhi_epi64(hi01, hi23);
+            let first_pair = self.combine_u16x8(c0.simd_into(self), c1.simd_into(self));
+            let second_pair = self.combine_u16x8(c2.simd_into(self), c3.simd_into(self));
+            self.combine_u16x16(first_pair, second_pair)
+        }
+    }
+    /// Stores the four 8-lane quarters of `a` to `dest` in interleaved order.
+    ///
+    /// With the quarters numbered in the order produced by `split_u16x32` followed by
+    /// `split_u16x16`, `dest[4 * k + c]` receives lane `k` of quarter `c`.
+    #[inline(always)]
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        let (front, back) = self.split_u16x32(a);
+        let (q0, q1) = self.split_u16x16(front);
+        let (q2, q3) = self.split_u16x16(back);
+        let (q0, q1, q2, q3) = (q0.into(), q1.into(), q2.into(), q3.into());
+        unsafe {
+            // 4x4 transpose of 32-bit pairs: each `w*` row holds two lanes of every quarter.
+            let lo01 = _mm_unpacklo_epi32(q0, q1);
+            let hi01 = _mm_unpackhi_epi32(q0, q1);
+            let lo23 = _mm_unpacklo_epi32(q2, q3);
+            let hi23 = _mm_unpackhi_epi32(q2, q3);
+            let w0 = _mm_unpacklo_epi64(lo01, lo23);
+            let w1 = _mm_unpackhi_epi64(lo01, lo23);
+            let w2 = _mm_unpacklo_epi64(hi01, hi23);
+            let w3 = _mm_unpackhi_epi64(hi01, hi23);
+            // Turn "pair per quarter" into "one lane per quarter, twice" inside each row.
+            let spread = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+            let out = dest.as_mut_ptr();
+            _mm_storeu_si128(out as *mut _, _mm_shuffle_epi8(w0, spread));
+            _mm_storeu_si128(out.add(8usize) as *mut _, _mm_shuffle_epi8(w1, spread));
+            _mm_storeu_si128(out.add(2 * 8usize) as *mut _, _mm_shuffle_epi8(w2, spread));
+            _mm_storeu_si128(out.add(3 * 8usize) as *mut _, _mm_shuffle_epi8(w3, spread));
+        }
+    }
+    #[inline(always)]
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
+        unsafe { _mm512_cvtepi16_epi8(a.into()).simd_into(self) } // truncating conversion: keeps the low 8 bits of each 16-bit lane
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        __m512i::from(a).simd_into(self) // round-trips through the raw `__m512i`; no per-lane arithmetic is applied here
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        __m512i::from(a).simd_into(self) // round-trips through the raw `__m512i`; no per-lane arithmetic is applied here
+    }
+    #[inline(always)]
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        // One bit per lane: all 32 lane bits set for `true`, none for `false`.
+        let bits: u64 = if val { 4294967295u64 } else { 0 };
+        let val = bits as _;
+        mask16x32 { val, simd: self }
+    }
+    /// Builds a compact mask from per-lane integers: bit `i` of the mask is set exactly when
+    /// `val[i]` is non-zero.
+    #[inline(always)]
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+        // Accumulate in a `u64`; the 32 lanes can only ever set the low 32 bits.
+        let mut bits = 0u64;
+        for (lane, &flag) in val.iter().enumerate() {
+            if flag != 0 {
+                bits |= 1u64 << lane;
+            }
+        }
+        mask16x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+        // A set mask bit becomes an all-ones lane (`!0`); a clear bit becomes a zero lane.
+        core::array::from_fn(|idx| if (u64::from(a.val) & (1u64 << idx)) == 0 { 0 } else { !0 })
+    }
+    #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        // Bits above the 32 lanes are discarded before narrowing to the mask's storage type.
+        let lanes = bits & 4294967295u64;
+        let val = lanes as _;
+        mask16x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        u64::from((a).val) & 4294967295u64 // bit `i` corresponds to lane `i`; only the low 32 bits can be set
+    }
+    #[inline(always)]
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        // Bitwise AND of the two lane bitmasks, limited to the 32 lane bits.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let val = ((lhs & rhs) & 4294967295u64) as _;
+        mask16x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        // Bitwise OR of the two lane bitmasks, limited to the 32 lane bits.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let val = ((lhs | rhs) & 4294967295u64) as _;
+        mask16x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        // Bitwise XOR of the two lane bitmasks, limited to the 32 lane bits.
+        let (lhs, rhs) = (u64::from(a.val), u64::from(b.val));
+        let val = ((lhs ^ rhs) & 4294967295u64) as _;
+        mask16x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        // Flip every bit, then drop the bits that lie above the 32 lanes.
+        let flipped = !u64::from(a.val);
+        let val = (flipped & 4294967295u64) as _;
+        mask16x32 { val, simd: self }
+    }
+    /// Per-lane select between two masks: the bit of `b` where `a` is set, else the bit of `c`.
+    #[inline(always)]
+    fn select_mask16x32(
+        self,
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        let (cond, on_set, on_clear) = (u64::from(a.val), u64::from(b.val), u64::from(c.val));
+        // Bit-select on the raw bitmasks; the final AND drops anything above the 32 lane bits.
+        let picked = (cond & on_set) | (!cond & on_clear);
+        let val = (picked & 4294967295u64) as _;
+        mask16x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        // XOR marks the lanes that differ; inverting it marks the lanes that agree.
+        let differing = u64::from(a.val ^ b.val);
+        let val = (!differing & 4294967295u64) as _;
+        mask16x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        // True when at least one of the 32 lane bits is set.
+        (u64::from(a.val) & 4294967295u64) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        // True when every one of the 32 lane bits is set.
+        (u64::from(a.val) & 4294967295u64) == 4294967295u64
+    }
+    #[inline(always)]
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        // True when at least one of the 32 lane bits is clear.
+        (u64::from(a.val) & 4294967295u64) != 4294967295u64
+    }
+    #[inline(always)]
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        // True when none of the 32 lane bits is set.
+        (u64::from(a.val) & 4294967295u64) == 0
+    }
+    /// Splits the mask into its low 16 lane bits and its high 16 lane bits, in that order.
+    #[inline(always)]
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+        let all_lanes = u64::from(a.val);
+        let low_half = mask16x16 {
+            val: (all_lanes & 65535u64) as _,
+            simd: self,
+        };
+        let high_half = mask16x16 {
+            val: ((all_lanes >> 16usize) & 65535u64) as _,
+            simd: self,
+        };
+        (low_half, high_half)
+    }
+    #[inline(always)]
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+        unsafe { _mm512_set1_epi32(val).simd_into(self) } // broadcasts `val` to all 16 lanes
+    }
+    #[inline(always)]
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        // The array is handed by reference to the crate's `checked_transmute_copy` helper; the
+        // helper's result type is inferred from the vector's storage field.
+        let val = unsafe { crate::support::checked_transmute_copy(&val) };
+        i32x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        // The borrowed array goes straight to the crate's `checked_transmute_copy` helper; the
+        // helper's result type is inferred from the vector's storage field.
+        let val = unsafe { crate::support::checked_transmute_copy(val) };
+        i32x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        unsafe { core::mem::transmute::<__m512i, [i32; 16usize]>(a.val.0) } // by-value reinterpretation of the register as 16 `i32`s
+    }
+    #[inline(always)]
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        unsafe { core::mem::transmute::<&__m512i, &[i32; 16usize]>(&a.val.0) } // shared view of the same storage; nothing is copied
+    }
+    #[inline(always)]
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        unsafe { core::mem::transmute::<&mut __m512i, &mut [i32; 16usize]>(&mut a.val.0) } // mutable view of the same storage; writes go to `a`
+    }
+    /// Writes the 16 lanes of `a` to `dest`.
+    ///
+    /// The lanes are copied as raw `i32` values, straight from the vector's storage.
+    #[inline(always)]
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+        let lanes = (&raw const a.val.0) as *const i32;
+        let out = dest.as_mut_ptr();
+        // Source is the local vector, destination a uniquely borrowed array of 16 elements.
+        unsafe { core::ptr::copy_nonoverlapping(lanes, out, 16usize) }
+    }
+    /// Reinterprets the storage of a `u8x64` as an `i32x16`.
+    ///
+    /// The bits are moved as-is with `transmute`; no lane values are converted.
+    #[inline(always)]
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        // The destination type is inferred from the `val` field of `i32x16`.
+        let val = unsafe { core::mem::transmute(a.val) };
+        i32x16 { val, simd: self }
+    }
+    /// Reinterprets the storage of an `i32x16` as a `u8x64`.
+    ///
+    /// The bits are moved as-is with `transmute`; no lane values are converted.
+    #[inline(always)]
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        // The destination type is inferred from the `val` field of `u8x64`.
+        let val = unsafe { core::mem::transmute(a.val) };
+        u8x64 { val, simd: self }
+    }
+    /// Returns 16 consecutive lanes of the 32-lane sequence `a` followed by `b`, starting at
+    /// lane `SHIFT`.
+    ///
+    /// A `SHIFT` of 16 or more returns `b` unchanged.
+    #[inline(always)]
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        unsafe {
+            // Byte positions 0..=63; `_mm512_set_epi8` lists the highest byte first.
+            let identity = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43,
+                42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
+                21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+            );
+            // Each lane is 4 bytes wide, so the window moves by `SHIFT * 4` bytes. Indices that
+            // reach 64 or more select bytes of the second source of the two-source permute.
+            let byte_shift = _mm512_set1_epi8((SHIFT * 4usize) as i8);
+            let idx = _mm512_add_epi8(identity, byte_shift);
+            let lo = self.cvt_to_bytes_i32x16(a).val.0;
+            let hi = self.cvt_to_bytes_i32x16(b).val.0;
+            let val = crate::support::Aligned512(_mm512_permutex2var_epi8(lo, idx, hi));
+            self.cvt_from_bytes_i32x16(u8x64 { val, simd: self })
+        }
+    }
+    /// Applies `slide_within_blocks_i32x8` to the lower halves and to the upper halves.
+    #[inline(always)]
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+        self,
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.slide_within_blocks_i32x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_i32x8::<SHIFT>(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) } // per-lane 32-bit addition
+    }
+    #[inline(always)]
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) } // per-lane 32-bit subtraction, `a - b`
+    }
+    #[inline(always)]
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) } // per-lane multiply keeping the low 32 bits of each product
+    }
+    #[inline(always)]
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } // bitwise AND across the whole 512-bit register
+    }
+    #[inline(always)]
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } // bitwise OR across the whole 512-bit register
+    }
+    #[inline(always)]
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } // bitwise XOR across the whole 512-bit register
+    }
+    #[inline(always)]
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        a ^ !0 // XOR with the all-ones scalar `!0`, via the vector type's `^` operator, flips every bit
+    }
+    #[inline(always)]
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        // Left shift: one shared count, passed in a 128-bit register, is applied to every lane.
+        let count = unsafe { _mm_cvtsi32_si128(shift.cast_signed()) };
+        unsafe { _mm512_sll_epi32(a.into(), count).simd_into(self) }
+    }
+    #[inline(always)]
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) } // per-lane left shift: lane `i` of `b` is the count for lane `i` of `a`
+    }
+    #[inline(always)]
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        // Arithmetic (sign-propagating) right shift by one shared count for every lane.
+        let count = unsafe { _mm_cvtsi32_si128(shift.cast_signed()) };
+        unsafe { _mm512_sra_epi32(a.into(), count).simd_into(self) }
+    }
+    #[inline(always)]
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_srav_epi32(a.into(), b.into()).simd_into(self) } // per-lane arithmetic right shift: lane `i` of `b` is the count for lane `i` of `a`
+    }
+    /// Lane-wise `a == b` on 32-bit lanes.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmpeq_epi32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a < b`, with lanes compared as signed 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmplt_epi32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a <= b`, with lanes compared as signed 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmple_epi32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a >= b`, with lanes compared as signed 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmpge_epi32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a > b`, with lanes compared as signed 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmpgt_epi32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Interleaves the lower eight lanes of `a` and `b`: a0, b0, a1, b1, ..., a7, b7.
+    #[inline(always)]
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a` (first source) and
+            // 16..=31 address `b` (second source).
+            let idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            let zipped = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Interleaves the upper eight lanes of `a` and `b`: a8, b8, a9, b9, ..., a15, b15.
+    #[inline(always)]
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a`, 16..=31 address `b`.
+            let idx =
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            let zipped = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes: a0, a2, ..., a14 followed by b0, b2, ..., b14.
+    #[inline(always)]
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a` (first source) and
+            // 16..=31 address `b` (second source).
+            let idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            let picked = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            picked.simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes: a1, a3, ..., a15 followed by b1, b3, ..., b15.
+    #[inline(always)]
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a` (first source) and
+            // 16..=31 address `b` (second source).
+            let idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            let picked = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            picked.simd_into(self)
+        }
+    }
+    /// Interleaves all lanes of `a` and `b`, returning the result as two vectors.
+    ///
+    /// The first result is a0, b0, a1, b1, ..., a7, b7 (built from the lower halves); the
+    /// second is a8, b8, ..., a15, b15 (built from the upper halves).
+    ///
+    /// Implemented as two `_mm512_permutex2var_epi32` permutes over the same pair of inputs.
+    #[inline(always)]
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        unsafe {
+            // `_mm512_setr_epi32` lists lanes from lane 0 upward. In the two-source permute,
+            // indices 0..=15 address the first source and 16..=31 the second.
+            let lo_idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            let hi_idx =
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            let first = a.into();
+            let second = b.into();
+            let low = _mm512_permutex2var_epi32(first, lo_idx, second);
+            let high = _mm512_permutex2var_epi32(first, hi_idx, second);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    /// Separates the even-indexed and odd-indexed lanes of the concatenation of `a` and `b`.
+    ///
+    /// The first result is a0, a2, ..., a14, b0, b2, ..., b14; the second is
+    /// a1, a3, ..., a15, b1, b3, ..., b15.
+    #[inline(always)]
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        unsafe {
+            // `_mm512_setr_epi32` lists lanes from lane 0 upward. In the two-source permute,
+            // indices 0..=15 address the first source and 16..=31 the second.
+            let even_idx =
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            let odd_idx =
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            let first = a.into();
+            let second = b.into();
+            // Both permutes read the same two inputs; only the index table differs.
+            let evens = _mm512_permutex2var_epi32(first, even_idx, second);
+            let odds = _mm512_permutex2var_epi32(first, odd_idx, second);
+            (evens.simd_into(self), odds.simd_into(self))
+        }
+    }
+    #[inline(always)]
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } // per lane: `b` where the mask bit of `a` is set, otherwise `c`
+    }
+    #[inline(always)]
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_min_epi32(a.into(), b.into()).simd_into(self) } // per-lane minimum, lanes compared as signed 32-bit integers
+    }
+    #[inline(always)]
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_max_epi32(a.into(), b.into()).simd_into(self) } // per-lane maximum, lanes compared as signed 32-bit integers
+    }
+    /// Splits the vector into two 8-lane halves, returned as (lower, upper).
+    #[inline(always)]
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let whole: __m512i = a.into();
+        // The cast keeps the low 256 bits; the extract with index 1 takes the high 256 bits.
+        let lower = unsafe { _mm512_castsi512_si256(whole) };
+        let upper = unsafe { _mm512_extracti64x4_epi64::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    #[inline(always)]
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        unsafe { _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(self) } // negation computed as `0 - a` per lane
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        __m512i::from(a).simd_into(self) // round-trips through the raw `__m512i`; no per-lane arithmetic is applied here
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+        __m512i::from(a).simd_into(self) // round-trips through the raw `__m512i`; no per-lane arithmetic is applied here
+    }
+    #[inline(always)]
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_cvtepi32_ps(a.into()).simd_into(self) } // numeric conversion of each signed 32-bit lane to `f32`
+    }
+    #[inline(always)]
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        unsafe { _mm512_set1_epi32(val.cast_signed()).simd_into(self) } // broadcasts `val` to all 16 lanes; `cast_signed` only matches the intrinsic's `i32` parameter
+    }
+    #[inline(always)]
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        // The array is handed by reference to the crate's `checked_transmute_copy` helper; the
+        // helper's result type is inferred from the vector's storage field.
+        let val = unsafe { crate::support::checked_transmute_copy(&val) };
+        u32x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        // The borrowed array goes straight to the crate's `checked_transmute_copy` helper; the
+        // helper's result type is inferred from the vector's storage field.
+        let val = unsafe { crate::support::checked_transmute_copy(val) };
+        u32x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        unsafe { core::mem::transmute::<__m512i, [u32; 16usize]>(a.val.0) } // by-value reinterpretation of the register as 16 `u32`s
+    }
+    #[inline(always)]
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        unsafe { core::mem::transmute::<&__m512i, &[u32; 16usize]>(&a.val.0) } // shared view of the same storage; nothing is copied
+    }
+    #[inline(always)]
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        unsafe { core::mem::transmute::<&mut __m512i, &mut [u32; 16usize]>(&mut a.val.0) } // mutable view of the same storage; writes go to `a`
+    }
+    /// Writes the 16 lanes of `a` to `dest`.
+    ///
+    /// The lanes are copied as raw `u32` values, straight from the vector's storage.
+    #[inline(always)]
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        let lanes = (&raw const a.val.0) as *const u32;
+        let out = dest.as_mut_ptr();
+        // Source is the local vector, destination a uniquely borrowed array of 16 elements.
+        unsafe { core::ptr::copy_nonoverlapping(lanes, out, 16usize) }
+    }
+    /// Reinterprets the storage of a `u8x64` as a `u32x16`.
+    ///
+    /// The bits are moved as-is with `transmute`; no lane values are converted.
+    #[inline(always)]
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        // The destination type is inferred from the `val` field of `u32x16`.
+        let val = unsafe { core::mem::transmute(a.val) };
+        u32x16 { val, simd: self }
+    }
+    /// Reinterprets the storage of a `u32x16` as a `u8x64`.
+    ///
+    /// The bits are moved as-is with `transmute`; no lane values are converted.
+    #[inline(always)]
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        // The destination type is inferred from the `val` field of `u8x64`.
+        let val = unsafe { core::mem::transmute(a.val) };
+        u8x64 { val, simd: self }
+    }
+    /// Returns 16 consecutive lanes of the 32-lane sequence `a` followed by `b`, starting at
+    /// lane `SHIFT`.
+    ///
+    /// A `SHIFT` of 16 or more returns `b` unchanged.
+    #[inline(always)]
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        unsafe {
+            // Byte positions 0..=63; `_mm512_set_epi8` lists the highest byte first.
+            let identity = _mm512_set_epi8(
+                63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43,
+                42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22,
+                21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+            );
+            // Each lane is 4 bytes wide, so the window moves by `SHIFT * 4` bytes. Indices that
+            // reach 64 or more select bytes of the second source of the two-source permute.
+            let byte_shift = _mm512_set1_epi8((SHIFT * 4usize) as i8);
+            let idx = _mm512_add_epi8(identity, byte_shift);
+            let lo = self.cvt_to_bytes_u32x16(a).val.0;
+            let hi = self.cvt_to_bytes_u32x16(b).val.0;
+            let val = crate::support::Aligned512(_mm512_permutex2var_epi8(lo, idx, hi));
+            self.cvt_from_bytes_u32x16(u8x64 { val, simd: self })
+        }
+    }
+    /// Applies `slide_within_blocks_u32x8` to the lower halves and to the upper halves.
+    #[inline(always)]
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
+        self,
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.slide_within_blocks_u32x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_u32x8::<SHIFT>(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_add_epi32(a.into(), b.into()).simd_into(self) } // per-lane 32-bit addition
+    }
+    #[inline(always)]
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_sub_epi32(a.into(), b.into()).simd_into(self) } // per-lane 32-bit subtraction, `a - b`
+    }
+    #[inline(always)]
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_mullo_epi32(a.into(), b.into()).simd_into(self) } // per-lane multiply keeping the low 32 bits of each product
+    }
+    #[inline(always)]
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_and_si512(a.into(), b.into()).simd_into(self) } // bitwise AND across the whole 512-bit register
+    }
+    #[inline(always)]
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_or_si512(a.into(), b.into()).simd_into(self) } // bitwise OR across the whole 512-bit register
+    }
+    #[inline(always)]
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_xor_si512(a.into(), b.into()).simd_into(self) } // bitwise XOR across the whole 512-bit register
+    }
+    #[inline(always)]
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+        a ^ !0 // XOR with the all-ones scalar `!0`, via the vector type's `^` operator, flips every bit
+    }
+    #[inline(always)]
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        // Left shift: one shared count, passed in a 128-bit register, is applied to every lane.
+        let count = unsafe { _mm_cvtsi32_si128(shift.cast_signed()) };
+        unsafe { _mm512_sll_epi32(a.into(), count).simd_into(self) }
+    }
+    #[inline(always)]
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_sllv_epi32(a.into(), b.into()).simd_into(self) } // per-lane left shift: lane `i` of `b` is the count for lane `i` of `a`
+    }
+    #[inline(always)]
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        // Logical (zero-filling) right shift by one shared count for every lane.
+        let count = unsafe { _mm_cvtsi32_si128(shift.cast_signed()) };
+        unsafe { _mm512_srl_epi32(a.into(), count).simd_into(self) }
+    }
+    #[inline(always)]
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_srlv_epi32(a.into(), b.into()).simd_into(self) } // per-lane logical right shift: lane `i` of `b` is the count for lane `i` of `a`
+    }
+    /// Lane-wise `a == b` on 32-bit lanes.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmpeq_epu32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a < b`, with lanes compared as unsigned 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmplt_epu32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a <= b`, with lanes compared as unsigned 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmple_epu32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a >= b`, with lanes compared as unsigned 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmpge_epu32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Lane-wise `a > b`, with lanes compared as unsigned 32-bit integers.
+    ///
+    /// The compare intrinsic already yields one result bit per lane, which is stored as-is.
+    #[inline(always)]
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (lhs, rhs): (__m512i, __m512i) = (a.into(), b.into());
+        let val = unsafe { _mm512_cmpgt_epu32_mask(lhs, rhs) };
+        mask32x16 { val, simd: self }
+    }
+    /// Interleaves the lower eight lanes of `a` and `b`: a0, b0, a1, b1, ..., a7, b7.
+    #[inline(always)]
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a` (first source) and
+            // 16..=31 address `b` (second source).
+            let idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            let zipped = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Interleaves the upper eight lanes of `a` and `b`: a8, b8, a9, b9, ..., a15, b15.
+    #[inline(always)]
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a`, 16..=31 address `b`.
+            let idx =
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            let zipped = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            zipped.simd_into(self)
+        }
+    }
+    /// Gathers the even-indexed lanes: a0, a2, ..., a14 followed by b0, b2, ..., b14.
+    #[inline(always)]
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a` (first source) and
+            // 16..=31 address `b` (second source).
+            let idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            let picked = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            picked.simd_into(self)
+        }
+    }
+    /// Gathers the odd-indexed lanes: a1, a3, ..., a15 followed by b1, b3, ..., b15.
+    #[inline(always)]
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe {
+            // Listed from lane 0 upward; indices 0..=15 address `a` (first source) and
+            // 16..=31 address `b` (second source).
+            let idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            let picked = _mm512_permutex2var_epi32(a.into(), idx, b.into());
+            picked.simd_into(self)
+        }
+    }
+    /// Interleaves all lanes of `a` and `b`, returning the result as two vectors.
+    ///
+    /// The first result is a0, b0, a1, b1, ..., a7, b7 (built from the lower halves); the
+    /// second is a8, b8, ..., a15, b15 (built from the upper halves).
+    ///
+    /// Implemented as two `_mm512_permutex2var_epi32` permutes over the same pair of inputs.
+    #[inline(always)]
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        unsafe {
+            // `_mm512_setr_epi32` lists lanes from lane 0 upward. In the two-source permute,
+            // indices 0..=15 address the first source and 16..=31 the second.
+            let lo_idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+            let hi_idx =
+                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+            let first = a.into();
+            let second = b.into();
+            let low = _mm512_permutex2var_epi32(first, lo_idx, second);
+            let high = _mm512_permutex2var_epi32(first, hi_idx, second);
+            (low.simd_into(self), high.simd_into(self))
+        }
+    }
+    /// Separates the even-indexed and odd-indexed lanes of the concatenation of `a` and `b`.
+    ///
+    /// The first result is a0, a2, ..., a14, b0, b2, ..., b14; the second is
+    /// a1, a3, ..., a15, b1, b3, ..., b15.
+    #[inline(always)]
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        unsafe {
+            // `_mm512_setr_epi32` lists lanes from lane 0 upward. In the two-source permute,
+            // indices 0..=15 address the first source and 16..=31 the second.
+            let even_idx =
+                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+            let odd_idx =
+                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+            let first = a.into();
+            let second = b.into();
+            // Both permutes read the same two inputs; only the index table differs.
+            let evens = _mm512_permutex2var_epi32(first, even_idx, second);
+            let odds = _mm512_permutex2var_epi32(first, odd_idx, second);
+            (evens.simd_into(self), odds.simd_into(self))
+        }
+    }
+    #[inline(always)]
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) } // per lane: `b` where the mask bit of `a` is set, otherwise `c`
+    }
+    #[inline(always)]
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_min_epu32(a.into(), b.into()).simd_into(self) } // per-lane minimum, lanes compared as unsigned 32-bit integers
+    }
+    #[inline(always)]
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        unsafe { _mm512_max_epu32(a.into(), b.into()).simd_into(self) } // per-lane maximum, lanes compared as unsigned 32-bit integers
+    }
+    /// Splits the vector into two 8-lane halves, returned as (lower, upper).
+    #[inline(always)]
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let whole: __m512i = a.into();
+        // The cast keeps the low 256 bits; the extract with index 1 takes the high 256 bits.
+        let lower = unsafe { _mm512_castsi512_si256(whole) };
+        let upper = unsafe { _mm512_extracti64x4_epi64::<1>(whole) };
+        (lower.simd_into(self), upper.simd_into(self))
+    }
+    /// Loads 16 `u32` values laid out as four interleaved channels and groups them by channel.
+    #[inline(always)]
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
+        unsafe {
+            let r0 = _mm_loadu_si128(src.as_ptr() as *const _);
+            let r1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
+            let r2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
+            let r3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
+            let lo01 = _mm_unpacklo_epi32(r0, r1); // 4x4 transpose: rows r0..r3 -> channels c0..c3
+            let hi01 = _mm_unpackhi_epi32(r0, r1);
+            let lo23 = _mm_unpacklo_epi32(r2, r3);
+            let hi23 = _mm_unpackhi_epi32(r2, r3);
+            let c0 = _mm_unpacklo_epi64(lo01, lo23);
+            let c1 = _mm_unpackhi_epi64(lo01, lo23);
+            let c2 = _mm_unpacklo_epi64(hi01, hi23);
+            let c3 = _mm_unpackhi_epi64(hi01, hi23);
+            let first_pair = self.combine_u32x4(c0.simd_into(self), c1.simd_into(self));
+            let second_pair = self.combine_u32x4(c2.simd_into(self), c3.simd_into(self));
+            self.combine_u32x8(first_pair, second_pair)
+        }
+    }
+    /// Stores the four 4-lane quarters of `a` to `dest` in interleaved order: with quarters
+    /// numbered as the splits produce them, `dest[4 * k + c]` gets lane `k` of quarter `c`.
+    #[inline(always)]
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        let (front, back) = self.split_u32x16(a);
+        let (q0, q1) = self.split_u32x8(front);
+        let (q2, q3) = self.split_u32x8(back);
+        let (q0, q1, q2, q3) = (q0.into(), q1.into(), q2.into(), q3.into());
+        unsafe {
+            let lo01 = _mm_unpacklo_epi32(q0, q1);
+            let hi01 = _mm_unpackhi_epi32(q0, q1);
+            let lo23 = _mm_unpacklo_epi32(q2, q3);
+            let hi23 = _mm_unpackhi_epi32(q2, q3);
+            let w0 = _mm_unpacklo_epi64(lo01, lo23);
+            let w1 = _mm_unpackhi_epi64(lo01, lo23);
+            let w2 = _mm_unpacklo_epi64(hi01, hi23);
+            let w3 = _mm_unpackhi_epi64(hi01, hi23);
+            let out = dest.as_mut_ptr();
+            _mm_storeu_si128(out as *mut _, w0);
+            _mm_storeu_si128(out.add(4usize) as *mut _, w1);
+            _mm_storeu_si128(out.add(2 * 4usize) as *mut _, w2);
+            _mm_storeu_si128(out.add(3 * 4usize) as *mut _, w3);
+        }
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        __m512i::from(a).simd_into(self) // round-trips through the raw `__m512i`; no per-lane arithmetic is applied here
+    }
+    #[inline(always)]
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+        unsafe { _mm512_cvtepu32_ps(a.into()).simd_into(self) } // numeric conversion of each unsigned 32-bit lane to `f32`
+    }
+    #[inline(always)]
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        // One bit per lane: all 16 lane bits set for `true`, none for `false`.
+        let bits: u64 = if val { 65535u64 } else { 0 };
+        let val = bits as _;
+        mask32x16 { val, simd: self }
+    }
+    /// Packs an array of `i32` lanes into the compact mask representation:
+    /// bit `i` of the result is set exactly when `val[i]` is non-zero.
+    #[inline(always)]
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
+        let bits = val
+            .iter()
+            .enumerate()
+            .filter(|(_, lane)| **lane != 0)
+            .fold(0u64, |acc, (i, _)| acc | (1u64 << i));
+        // Only bits 0..16 can be set here; the `as _` cast narrows to the
+        // mask's storage type.
+        mask32x16 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
+        let mask_bits: u64 = a.val.into();
+        core::array::from_fn(|lane| -(((mask_bits >> lane) & 1) as i32)) // set bit -> -1 (all ones), clear bit -> 0
+    }
+    #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        mask32x16 {
+            simd: self,
+            val: (bits & 0xFFFF) as _, // keep only the 16 lane bits, then narrow to the storage type
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        0xFFFF & u64::from(a.val) // one bit per lane; nothing above bit 15 is reported
+    }
+    #[inline(always)]
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Lane-wise AND on the packed bits; `from_bitmask_mask32x16` then trims
+        // to the 16 lane bits and narrows to the mask's storage type.
+        let both = u64::from(a.val) & u64::from(b.val);
+        self.from_bitmask_mask32x16(both)
+    }
+    #[inline(always)]
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Lane-wise OR on the packed bits; `from_bitmask_mask32x16` then trims
+        // to the 16 lane bits and narrows to the mask's storage type.
+        let either = u64::from(a.val) | u64::from(b.val);
+        self.from_bitmask_mask32x16(either)
+    }
+    #[inline(always)]
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // Lane-wise XOR on the packed bits; `from_bitmask_mask32x16` then trims
+        // to the 16 lane bits and narrows to the mask's storage type.
+        let exactly_one = u64::from(a.val) ^ u64::from(b.val);
+        self.from_bitmask_mask32x16(exactly_one)
+    }
+    #[inline(always)]
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        // Inverting the widened value also sets bits 16..64; they are dropped
+        // again by `from_bitmask_mask32x16`, which keeps only the lane bits.
+        let inverted = !u64::from(a.val);
+        self.from_bitmask_mask32x16(inverted)
+    }
+    #[inline(always)]
+    fn select_mask32x16(
+        self,
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        // Per lane: take `b` where the selector `a` is set, otherwise `c`.
+        let selector = u64::from(a.val);
+        let from_b = selector & u64::from(b.val);
+        let from_c = !selector & u64::from(c.val);
+        // `from_bitmask_mask32x16` drops everything above the 16 lane bits.
+        self.from_bitmask_mask32x16(from_b | from_c)
+    }
+    #[inline(always)]
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        // XOR marks the lanes where the two masks disagree; inverting it marks
+        // the lanes where they agree.
+        let differing = u64::from(a.val ^ b.val);
+        self.from_bitmask_mask32x16(!differing)
+    }
+    #[inline(always)]
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True as soon as a single lane bit is set.
+        self.to_bitmask_mask32x16(a) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True only when every one of the 16 lane bits is set.
+        self.to_bitmask_mask32x16(a) == 0xFFFF
+    }
+    #[inline(always)]
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True when at least one of the 16 lane bits is clear.
+        self.to_bitmask_mask32x16(a) != 0xFFFF
+    }
+    #[inline(always)]
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        // True only when no lane bit is set.
+        self.to_bitmask_mask32x16(a) == 0
+    }
+    #[inline(always)]
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
+        // Bits 0..8 describe the first eight lanes, bits 8..16 the last eight.
+        let packed = u64::from(a.val);
+        let lower = mask32x8 {
+            val: (packed & 0xFF) as _,
+            simd: self,
+        };
+        let upper = mask32x8 {
+            val: ((packed >> 8) & 0xFF) as _,
+            simd: self,
+        };
+        (lower, upper)
+    }
+    #[inline(always)]
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+        unsafe { _mm512_set1_pd(val).simd_into(self) } // broadcast `val` into all eight lanes
+    }
+    #[inline(always)]
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
+            val: { unsafe { crate::support::checked_transmute_copy(&val) } }, // bit-copy the array into the vector storage (crate helper; presumably size-checked)
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
+            val: { unsafe { crate::support::checked_transmute_copy(val) } }, // bit-copy the referenced array into the vector storage (crate helper; presumably size-checked)
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        unsafe { core::mem::transmute::<__m512d, [f64; 8usize]>(a.val.0) } // same 64 bytes, viewed as eight f64 values
+    }
+    #[inline(always)]
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        unsafe { core::mem::transmute::<&__m512d, &[f64; 8usize]>(&a.val.0) } // reinterpret the borrow in place; no copy
+    }
+    #[inline(always)]
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        unsafe { core::mem::transmute::<&mut __m512d, &mut [f64; 8usize]>(&mut a.val.0) } // reinterpret the mutable borrow in place; writes go to the vector
+    }
+    #[inline(always)]
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+        unsafe {
+            core::ptr::copy_nonoverlapping( // copy all eight lanes from the local vector into `dest`
+                (&raw const a.val.0) as *const f64, // source: the wrapped register, read as f64 elements
+                dest.as_mut_ptr(),
+                8usize,
+            );
+        }
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        unsafe {
+            f64x8 {
+                val: core::mem::transmute(a.val), // bit-for-bit reinterpretation of the byte vector's storage
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+        unsafe {
+            u8x64 {
+                val: core::mem::transmute(a.val), // bit-for-bit reinterpretation of the f64 vector's storage
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            if SHIFT >= 8usize { // sliding by the full width (or more) leaves only `b`
+                return b;
+            }
+            let idx = _mm512_add_epi8( // byte index i + SHIFT * 8 for each output byte i
+                _mm512_set_epi8( // `set` lists the highest element first, so element i holds i
+                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
+                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
+                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+                    1, 0,
+                ),
+                _mm512_set1_epi8((SHIFT * 8usize) as i8), // lane shift in bytes; at most 56 after the guard above
+            );
+            let result = _mm512_permutex2var_epi8( // indices 0..=63 read bytes of `a`, 64..=127 read bytes of `b`
+                self.cvt_to_bytes_f64x8(a).val.0,
+                idx,
+                self.cvt_to_bytes_f64x8(b).val.0,
+            );
+            self.cvt_from_bytes_f64x8(u8x64 { // re-wrap the shuffled bytes as f64 lanes
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+        self,
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
+        // Slide each 256-bit half separately (low with low, high with high).
+        let (a_lo, a_hi) = self.split_f64x8(a);
+        let (b_lo, b_hi) = self.split_f64x8(b);
+        let lo = self.slide_within_blocks_f64x4::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_f64x4::<SHIFT>(a_hi, b_hi);
+        self.combine_f64x4(lo, hi)
+    }
+    #[inline(always)]
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(self) } // `-0.0` is just the sign bit; (!sign) & a clears it
+    }
+    #[inline(always)]
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(self) } // XOR with the sign bit flips the sign of every lane
+    }
+    #[inline(always)]
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_sqrt_pd(a.into()).simd_into(self) } // lane-wise square root
+    }
+    #[inline(always)]
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        // Computed per 256-bit half by the `f64x4` implementation.
+        let (lo, hi) = self.split_f64x8(a);
+        let recip_lo = self.approximate_recip_f64x4(lo);
+        let recip_hi = self.approximate_recip_f64x4(hi);
+        self.combine_f64x4(recip_lo, recip_hi)
+    }
+    #[inline(always)]
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_add_pd(a.into(), b.into()).simd_into(self) } // lane-wise a + b
+    }
+    #[inline(always)]
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_sub_pd(a.into(), b.into()).simd_into(self) } // lane-wise a - b
+    }
+    #[inline(always)]
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_mul_pd(a.into(), b.into()).simd_into(self) } // lane-wise a * b
+    }
+    #[inline(always)]
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_div_pd(a.into(), b.into()).simd_into(self) } // lane-wise a / b
+    }
+    #[inline(always)]
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            let mask = _mm512_set1_pd(-0.0); // `-0.0` has only the sign bit set
+            _mm512_or_pd(
+                _mm512_and_pd(mask, b.into()), // sign bit of `b`
+                _mm512_andnot_pd(mask, a.into()), // everything but the sign bit of `a`
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()), // predicate 0 = _CMP_EQ_OQ: ordered, so false when either lane is NaN
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()), // predicate 17 = _CMP_LT_OQ: ordered, so false when either lane is NaN
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()), // predicate 18 = _CMP_LE_OQ: ordered, so false when either lane is NaN
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()), // predicate 29 = _CMP_GE_OQ: ordered, so false when either lane is NaN
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        unsafe {
+            mask64x8 {
+                val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()), // predicate 30 = _CMP_GT_OQ: ordered, so false when either lane is NaN
+                simd: self,
+            }
+        }
+    }
+    #[inline(always)]
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), // 0..=7 index `a`, 8..=15 index `b`: a0,b0,a1,b1,a2,b2,a3,b3
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), // 0..=7 index `a`, 8..=15 index `b`: a4,b4,a5,b5,a6,b6,a7,b7
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), // even-numbered elements of `a` (0..=7), then of `b` (8..=15)
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            _mm512_permutex2var_pd(
+                a.into(),
+                _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), // odd-numbered elements of `a` (0..=7), then of `b` (8..=15)
+                b.into(),
+            )
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        unsafe {
+            let a = a.into(); // shadow with the raw registers so both permutes can reuse them
+            let b = b.into();
+            (
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b) // same indices as zip_low: a0,b0,...,a3,b3
+                    .simd_into(self),
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b) // same indices as zip_high: a4,b4,...,a7,b7
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        unsafe {
+            let a = a.into(); // shadow with the raw registers so both permutes can reuse them
+            let b = b.into();
+            (
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b) // even-numbered elements of `a`, then of `b`
+                    .simd_into(self),
+                _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b) // odd-numbered elements of `a`, then of `b`
+                    .simd_into(self),
+            )
+        }
+    }
+    #[inline(always)]
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_max_pd(a.into(), b.into()).simd_into(self) } // lane-wise maximum; NaN handling is whatever the instruction does
+    }
+    #[inline(always)]
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_min_pd(a.into(), b.into()).simd_into(self) } // lane-wise minimum; NaN handling is whatever the instruction does
+    }
+    #[inline(always)]
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            let intermediate = _mm512_max_pd(a.into(), b.into()); // plain instruction result
+            let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into()); // predicate 3 = _CMP_UNORD_Q; b vs b is set exactly where `b` is NaN
+            _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) // where `b` is NaN take `a`, otherwise keep the max
+        }
+    }
+    #[inline(always)]
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        unsafe {
+            let intermediate = _mm512_min_pd(a.into(), b.into()); // plain instruction result
+            let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into()); // predicate 3 = _CMP_UNORD_Q; b vs b is set exactly where `b` is NaN
+            _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self) // where `b` is NaN take `a`, otherwise keep the min
+        }
+    }
+    #[inline(always)]
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } // fused (a * b) + c per lane
+    }
+    #[inline(always)]
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } // fused (a * b) - c per lane
+    }
+    #[inline(always)]
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (lo, hi) = self.split_f64x8(a); // handled per 256-bit half by the `f64x4` implementation
+        self.combine_f64x4(self.floor_f64x4(lo), self.floor_f64x4(hi))
+    }
+    #[inline(always)]
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (lo, hi) = self.split_f64x8(a); // handled per 256-bit half by the `f64x4` implementation
+        self.combine_f64x4(self.ceil_f64x4(lo), self.ceil_f64x4(hi))
+    }
+    #[inline(always)]
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        // Computed per 256-bit half by the `f64x4` implementation.
+        let (lo, hi) = self.split_f64x8(a);
+        let rounded_lo = self.round_ties_even_f64x4(lo);
+        let rounded_hi = self.round_ties_even_f64x4(hi);
+        self.combine_f64x4(rounded_lo, rounded_hi)
+    }
+    #[inline(always)]
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        a - self.trunc_f64x8(a) // the input minus its truncated value, lane by lane
+    }
+    #[inline(always)]
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (lo, hi) = self.split_f64x8(a); // handled per 256-bit half by the `f64x4` implementation
+        self.combine_f64x4(self.trunc_f64x4(lo), self.trunc_f64x4(hi))
+    }
+    #[inline(always)]
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        unsafe { _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) } // blend takes its last operand where the bit is set: `b` if set, else `c`
+    }
+    #[inline(always)]
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        unsafe {
+            (
+                _mm512_castpd512_pd256(a.into()).simd_into(self), // low 256 bits (lanes 0..4)
+                _mm512_extractf64x4_pd::<1>(a.into()).simd_into(self), // high 256 bits (lanes 4..8)
+            )
+        }
+    }
+    #[inline(always)]
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+        unsafe { _mm512_castpd_ps(a.into()).simd_into(self) } // register cast only: same bits, now typed as sixteen f32 lanes
+    }
+    #[inline(always)]
+    fn splat_mask64x8(self, val: bool) -> mask64x8<Self> {
+        // Broadcast the flag to every lane: all 8 lane bits set for `true`,
+        // all cleared for `false`.
+        let lane_bits: u64 = if val { 0xFF } else { 0 };
+        self.from_bitmask_mask64x8(lane_bits)
+    }
+    /// Packs an array of `i64` lanes into the compact mask representation:
+    /// bit `i` of the result is set exactly when `val[i]` is non-zero.
+    #[inline(always)]
+    fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
+        let bits = val
+            .iter()
+            .enumerate()
+            .filter(|(_, lane)| **lane != 0)
+            .fold(0u64, |acc, (i, _)| acc | (1u64 << i));
+        // Only bits 0..8 can be set here; the `as _` cast narrows to the
+        // mask's storage type.
+        mask64x8 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
+        let mask_bits: u64 = a.val.into();
+        core::array::from_fn(|lane| -(((mask_bits >> lane) & 1) as i64)) // set bit -> -1 (all ones), clear bit -> 0
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        mask64x8 {
+            simd: self,
+            val: (bits & 0xFF) as _, // keep only the 8 lane bits, then narrow to the storage type
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        0xFF & u64::from(a.val) // one bit per lane; nothing above bit 7 is reported
+    }
+    #[inline(always)]
+    fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        // Lane-wise AND on the packed bits; `from_bitmask_mask64x8` then trims
+        // to the 8 lane bits and narrows to the mask's storage type.
+        let both = u64::from(a.val) & u64::from(b.val);
+        self.from_bitmask_mask64x8(both)
+    }
+    #[inline(always)]
+    fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        // Lane-wise OR on the packed bits; `from_bitmask_mask64x8` then trims
+        // to the 8 lane bits and narrows to the mask's storage type.
+        let either = u64::from(a.val) | u64::from(b.val);
+        self.from_bitmask_mask64x8(either)
+    }
+    #[inline(always)]
+    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        // Lane-wise XOR on the packed bits; `from_bitmask_mask64x8` then trims
+        // to the 8 lane bits and narrows to the mask's storage type.
+        let exactly_one = u64::from(a.val) ^ u64::from(b.val);
+        self.from_bitmask_mask64x8(exactly_one)
+    }
+    #[inline(always)]
+    fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
+        // Inverting the widened value also sets bits 8..64; they are dropped
+        // again by `from_bitmask_mask64x8`, which keeps only the lane bits.
+        let inverted = !u64::from(a.val);
+        self.from_bitmask_mask64x8(inverted)
+    }
+    #[inline(always)]
+    fn select_mask64x8(
+        self,
+        a: mask64x8<Self>,
+        b: mask64x8<Self>,
+        c: mask64x8<Self>,
+    ) -> mask64x8<Self> {
+        // Per lane: take `b` where the selector `a` is set, otherwise `c`.
+        let selector = u64::from(a.val);
+        let from_b = selector & u64::from(b.val);
+        let from_c = !selector & u64::from(c.val);
+        // `from_bitmask_mask64x8` drops everything above the 8 lane bits.
+        self.from_bitmask_mask64x8(from_b | from_c)
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
+        // XOR marks the lanes where the two masks disagree; inverting it marks
+        // the lanes where they agree.
+        let differing = u64::from(a.val ^ b.val);
+        self.from_bitmask_mask64x8(!differing)
+    }
+    #[inline(always)]
+    fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
+        // True as soon as a single lane bit is set.
+        self.to_bitmask_mask64x8(a) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
+        // True only when every one of the 8 lane bits is set.
+        self.to_bitmask_mask64x8(a) == 0xFF
+    }
+    #[inline(always)]
+    fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
+        // True when at least one of the 8 lane bits is clear.
+        self.to_bitmask_mask64x8(a) != 0xFF
+    }
+    #[inline(always)]
+    fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
+        // True only when no lane bit is set.
+        self.to_bitmask_mask64x8(a) == 0
+    }
+    #[inline(always)]
+    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
+        // Bits 0..4 describe the first four lanes, bits 4..8 the last four.
+        let packed = u64::from(a.val);
+        let lower = mask64x4 {
+            val: (packed & 0xF) as _,
+            simd: self,
+        };
+        let upper = mask64x4 {
+            val: ((packed >> 4) & 0xF) as _,
+            simd: self,
+        };
+        (lower, upper)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask16, S> for mask8x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask16) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask8x16<S>> for __mmask16 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask8x16<S>) -> Self {
+        value.to_bitmask() as __mmask16 // narrow the bitmask to the 16-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask16x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask16x8<S>> for __mmask8 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask16x8<S>) -> Self {
+        value.to_bitmask() as __mmask8 // narrow the bitmask to the 8-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask32x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask32x4<S>> for __mmask8 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask32x4<S>) -> Self {
+        value.to_bitmask() as __mmask8 // narrow the bitmask to the 8-bit k-mask type (this mask has only 4 lanes)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask64x2<S>> for __mmask8 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask64x2<S>) -> Self {
+        value.to_bitmask() as __mmask8 // narrow the bitmask to the 8-bit k-mask type (this mask has only 2 lanes)
+    }
+}
+impl<S: Simd> SimdFrom<__mmask32, S> for mask8x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask32) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask8x32<S>> for __mmask32 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask8x32<S>) -> Self {
+        value.to_bitmask() as __mmask32 // narrow the bitmask to the 32-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__mmask16, S> for mask16x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask16) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask16x16<S>> for __mmask16 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask16x16<S>) -> Self {
+        value.to_bitmask() as __mmask16 // narrow the bitmask to the 16-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask32x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask32x8<S>> for __mmask8 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask32x8<S>) -> Self {
+        value.to_bitmask() as __mmask8 // narrow the bitmask to the 8-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask64x4<S>> for __mmask8 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask64x4<S>) -> Self {
+        value.to_bitmask() as __mmask8 // narrow the bitmask to the 8-bit k-mask type (this mask has only 4 lanes)
+    }
+}
+impl<S: Simd> SimdFrom<__m512, S> for f32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<f32x16<S>> for __m512 {
+    #[inline(always)]
+    fn from(value: f32x16<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512`
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i8x64<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i8x64<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: i8x64<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512i`
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u8x64<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u8x64<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: u8x64<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512i`
+    }
+}
+impl<S: Simd> SimdFrom<__mmask64, S> for mask8x64<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask64) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: hand the k-mask bits to `from_bitmask` to build the mask
+    }
+}
+impl<S: Simd> From<mask8x64<S>> for __mmask64 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask8x64<S>) -> Self {
+        value.to_bitmask() as __mmask64 // presumably a same-width cast here, which is what the `allow` above anticipates
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i16x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i16x32<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: i16x32<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512i`
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u16x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u16x32<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: u16x32<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512i`
+    }
+}
+impl<S: Simd> SimdFrom<__mmask32, S> for mask16x32<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask32) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask16x32<S>> for __mmask32 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask16x32<S>) -> Self {
+        value.to_bitmask() as __mmask32 // narrow the bitmask to the 32-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for i32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i32x16<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: i32x16<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512i`
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u32x16<S>> for __m512i {
+    #[inline(always)]
+    fn from(value: u32x16<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512i`
+    }
+}
+impl<S: Simd> SimdFrom<__mmask16, S> for mask32x16<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask16) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask32x16<S>> for __mmask16 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask32x16<S>) -> Self {
+        value.to_bitmask() as __mmask16 // narrow the bitmask to the 16-bit k-mask type
+    }
+}
+impl<S: Simd> SimdFrom<__m512d, S> for f64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512d) -> Self {
+        Self {
+            val: unsafe { crate::support::checked_transmute_copy(&arch) }, // bit-copy the raw register into this vector's storage (crate helper; presumably size-checked)
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<f64x8<S>> for __m512d {
+    #[inline(always)]
+    fn from(value: f64x8<S>) -> Self {
+        unsafe { crate::support::checked_transmute_copy(&value.val) } // bit-copy the vector's storage out as a raw `__m512d`
+    }
+}
+impl<S: Simd> SimdFrom<__mmask8, S> for mask64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __mmask8) -> Self {
+        Self::from_bitmask(simd, u64::from(arch)) // generic over `S`: widen the k-mask bits and let `from_bitmask` build the mask
+    }
+}
+impl<S: Simd> From<mask64x8<S>> for __mmask8 {
+    #[inline(always)]
+    #[allow(
+        trivial_numeric_casts,
+        reason = "generated uniformly for all __mmask widths"
+    )]
+    fn from(value: mask64x8<S>) -> Self {
+        value.to_bitmask() as __mmask8 // narrow the bitmask to the 8-bit k-mask type
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
+    unsafe { // caller's obligation: the CPU must support `_mm_alignr_epi8` (an SSSE3 intrinsic)
+        match shift { // one arm per byte shift, each forwarding `shift` as the intrinsic's const parameter
+            0usize => _mm_alignr_epi8::<0i32>(a, b),
+            1usize => _mm_alignr_epi8::<1i32>(a, b),
+            2usize => _mm_alignr_epi8::<2i32>(a, b),
+            3usize => _mm_alignr_epi8::<3i32>(a, b),
+            4usize => _mm_alignr_epi8::<4i32>(a, b),
+            5usize => _mm_alignr_epi8::<5i32>(a, b),
+            6usize => _mm_alignr_epi8::<6i32>(a, b),
+            7usize => _mm_alignr_epi8::<7i32>(a, b),
+            8usize => _mm_alignr_epi8::<8i32>(a, b),
+            9usize => _mm_alignr_epi8::<9i32>(a, b),
+            10usize => _mm_alignr_epi8::<10i32>(a, b),
+            11usize => _mm_alignr_epi8::<11i32>(a, b),
+            12usize => _mm_alignr_epi8::<12i32>(a, b),
+            13usize => _mm_alignr_epi8::<13i32>(a, b),
+            14usize => _mm_alignr_epi8::<14i32>(a, b),
+            15usize => _mm_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(), // panics if reached: callers must pass `shift` in 0..=15
+        }
+    }
+}

From 81441cfd85bc277129397eaa6c13137a77483f7c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:53:01 +0100
Subject: [PATCH 06/55] Fix build after file rename

---
 fearless_simd_tests/tests/harness/lm_generated.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs
index 3e30f814e..a7d381969 100644
--- a/fearless_simd_tests/tests/harness/lm_generated.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
 mod extended_512;
-mod mask_methods;
+mod mask_roundtrip;
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod mask_roundtrip_x86;
 mod mod_256;

From 0d6af5d5a9d84c22b49ff284d9aac7164a048905 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 18:58:16 +0100
Subject: [PATCH 07/55] Use AVX-512 instructions for f32 -> u32 conversions.
 Expand test coverage for these ops.

---
 fearless_simd/src/generated/avx512.rs         | 69 +++++------------
 fearless_simd_gen/src/mk_x86.rs               | 74 +++++++++++--------
 .../tests/harness/lm_generated/mod_256.rs     | 49 ++++++++++++
 3 files changed, 108 insertions(+), 84 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 3b8bd1af9..644598de1 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -374,37 +374,19 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
-        unsafe {
-            let mut converted = _mm_cvttps_epi32(a.into());
-            let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0));
-            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
-            if !all_in_range {
-                let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0));
-                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
-                converted = _mm_add_epi32(converted, excess_converted);
-            }
-            converted.simd_into(self)
-        }
+        unsafe { _mm_cvttps_epu32(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
         unsafe {
             let a = _mm_max_ps(a.into(), _mm_setzero_ps());
-            let mut converted = _mm_cvttps_epi32(a);
-            let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
-            let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
-            if !all_in_range {
-                let exceeds_unsigned_range =
-                    _mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a));
-                let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0));
-                let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
-                converted = _mm_add_epi32(converted, excess_converted);
-                converted = _mm_blendv_epi8(
-                    converted,
-                    _mm_set1_epi32(u32::MAX.cast_signed()),
-                    exceeds_unsigned_range,
-                );
-            }
+            let mut converted = _mm_cvttps_epu32(a);
+            let exceeds_unsigned_range = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), a);
+            converted = _mm_mask_blend_epi32(
+                exceeds_unsigned_range,
+                converted,
+                _mm_set1_epi32(u32::MAX.cast_signed()),
+            );
             converted.simd_into(self)
         }
     }
@@ -2964,37 +2946,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            let mut converted = _mm256_cvttps_epi32(a.into());
-            let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0));
-            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
-            if !all_in_range {
-                let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0));
-                let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
-                converted = _mm256_add_epi32(converted, excess_converted);
-            }
-            converted.simd_into(self)
-        }
+        unsafe { _mm256_cvttps_epu32(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
         unsafe {
             let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
-            let mut converted = _mm256_cvttps_epi32(a);
-            let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
-            let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
-            if !all_in_range {
-                let exceeds_unsigned_range =
-                    _mm256_castps_si256(_mm256_cmp_ps::<17i32>(_mm256_set1_ps(4294967040.0), a));
-                let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0));
-                let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
-                converted = _mm256_add_epi32(converted, excess_converted);
-                converted = _mm256_blendv_epi8(
-                    converted,
-                    _mm256_set1_epi32(u32::MAX.cast_signed()),
-                    exceeds_unsigned_range,
-                );
-            }
+            let mut converted = _mm256_cvttps_epu32(a);
+            let exceeds_unsigned_range =
+                _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a);
+            converted = _mm256_mask_blend_epi32(
+                exceeds_unsigned_range,
+                converted,
+                _mm256_set1_epi32(u32::MAX.cast_signed()),
+            );
             converted.simd_into(self)
         }
     }
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 3c35b249e..a95088530 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -2369,6 +2369,48 @@ impl X86 {
             vec_ty.scalar_bits, target_scalar_bits,
             "we currently only support converting between types of the same width"
         );
+        if *self == Self::Avx512
+            && vec_ty.scalar == ScalarType::Float
+            && target_scalar == ScalarType::Unsigned
+        {
+            let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
+            let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits());
+            let expr = if precise {
+                let max = simple_intrinsic("max", vec_ty);
+                let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                let blend = avx512_mask_blend_intrinsic(&target_ty);
+                let set1_float = set1_intrinsic(vec_ty);
+                let set1_int = set1_intrinsic(&target_ty);
+                let set0_float = intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits());
+                let lt = avx512_float_compare_predicate("simd_lt");
+                quote! {
+                    unsafe {
+                        let a = #max(a.into(), #set0_float());
+                        let mut converted = #convert(a);
+                        let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a);
+                        converted = #blend(
+                            exceeds_unsigned_range,
+                            converted,
+                            #set1_int(u32::MAX.cast_signed()),
+                        );
+                        converted.simd_into(self)
+                    }
+                }
+            } else {
+                quote! {
+                    unsafe {
+                        #convert(a.into()).simd_into(self)
+                    }
+                }
+            };
+
+            return quote! {
+                #method_sig {
+                    #expr
+                }
+            };
+        }
+
         if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
             let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
             let expr = match (vec_ty.scalar, target_scalar) {
@@ -2402,38 +2444,6 @@ impl X86 {
                         }
                     }
                 }
-                (ScalarType::Float, ScalarType::Unsigned) => {
-                    let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits());
-                    if precise {
-                        let max = simple_intrinsic("max", vec_ty);
-                        let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
-                        let blend = avx512_mask_blend_intrinsic(&target_ty);
-                        let set1_float = set1_intrinsic(vec_ty);
-                        let set1_int = set1_intrinsic(&target_ty);
-                        let set0_float =
-                            intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits());
-                        let lt = avx512_float_compare_predicate("simd_lt");
-                        quote! {
-                            unsafe {
-                                let a = #max(a.into(), #set0_float());
-                                let mut converted = #convert(a);
-                                let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a);
-                                converted = #blend(
-                                    exceeds_unsigned_range,
-                                    converted,
-                                    #set1_int(u32::MAX.cast_signed()),
-                                );
-                                converted.simd_into(self)
-                            }
-                        }
-                    } else {
-                        quote! {
-                            unsafe {
-                                #convert(a.into()).simd_into(self)
-                            }
-                        }
-                    }
-                }
                 (ScalarType::Int, ScalarType::Float) => {
                     let intrinsic = simple_intrinsic("cvtepi32", &target_ty);
                     quote! {
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
index 01363baca..797f54f64 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
@@ -316,6 +316,55 @@ fn trunc_f32x8_special_values<S: Simd>(simd: S) {
     }
 }
 
+#[simd_test]
+fn cvt_u32_f32x8<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(simd, &[1.0, 42.7, 3e9, -0.3, 0.0, 17.9, 255.99, 1024.1]);
+    assert_eq!(
+        *a.to_int::<u32x8<_>>(),
+        [1, 42, 3000000000, 0, 0, 17, 255, 1024]
+    );
+}
+
+#[simd_test]
+fn cvt_u32_precise_f32x8<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(
+        simd,
+        &[-1.0, 42.7, 5e9, f32::NAN, 0.0, 1.9, 3000000000.0, -5e9],
+    );
+    assert_eq!(
+        *a.to_int_precise::<u32x8<_>>(),
+        [0, 42, u32::MAX, 0, 0, 1, 3000000000, 0]
+    );
+}
+
+#[simd_test]
+fn cvt_u32_f32x8_rounding<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(simd, &[0.0, 0.49, 0.51, 0.99, 1.01, 1.99, 2.5, 3.75]);
+    assert_eq!(*a.to_int::<u32x8<_>>(), [0, 0, 0, 0, 1, 1, 2, 3]);
+}
+
+#[simd_test]
+fn cvt_u32_precise_f32x8_inf<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(
+        simd,
+        &[
+            -10.3,
+            f32::NAN,
+            f32::INFINITY,
+            f32::NEG_INFINITY,
+            u32::MAX as f32,
+            4294967040.0,
+            4294967296.0,
+            -0.5,
+        ],
+    );
+
+    assert_eq!(
+        *a.to_int_precise::<u32x8<_>>(),
+        [0, 0, u32::MAX, u32::MIN, u32::MAX, 4294967040, u32::MAX, 0]
+    );
+}
+
 #[simd_test]
 fn select_f32x8<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);

From 025c17298018bf49058faa41026ba8e698288a4a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 19:16:29 +0100
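Subject: [PATCH 08/55] Optimize load_array/as_array on AVX-512 masks; the
 initial impl was scalar, now we use the dedicated intrinsics.

Behavior note: load_array previously treated any lane that was `!= 0` as
true. It now derives each mask bit from the lane's sign bit (movepi*_mask),
so lanes holding anything other than 0 or -1 have unspecified results; for
example, a lane holding 1 used to load as true and now loads as false.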

---
 fearless_simd/src/generated/avx512.rs | 264 +++++++++++---------------
 fearless_simd_gen/src/mk_x86.rs       |  36 ++--
 2 files changed, 129 insertions(+), 171 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 644598de1..986ea6f93 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -936,24 +936,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 16usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask8x16 {
+                val: _mm_movepi8_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask8x16 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm_movm_epi8(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
@@ -1516,24 +1512,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 8usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask16x8 {
+                val: _mm_movepi16_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask16x8 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm_movm_epi16(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
@@ -2106,24 +2098,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 4usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask32x4 {
+                val: _mm_movepi32_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask32x4 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm_movm_epi32(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
@@ -2486,24 +2474,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 2usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask64x2 {
+                val: _mm_movepi64_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask64x2 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm_movm_epi64(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
@@ -3711,24 +3695,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 32usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask8x32 {
+                val: _mm256_movepi8_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask8x32 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm256_movm_epi8(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
@@ -4467,24 +4447,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 16usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask16x16 {
+                val: _mm256_movepi16_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask16x16 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm256_movm_epi16(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
@@ -5204,24 +5180,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 8usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask32x8 {
+                val: _mm256_movepi32_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask32x8 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm256_movm_epi32(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
@@ -5653,24 +5625,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 4usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask64x4 {
+                val: _mm256_movepi64_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask64x4 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm256_movm_epi64(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
@@ -7029,24 +6997,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 64usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask8x64 {
+                val: _mm512_movepi8_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask8x64 {
-            val: bits,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm512_movm_epi8(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
@@ -7872,24 +7836,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 32usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask16x32 {
+                val: _mm512_movepi16_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask16x32 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm512_movm_epi16(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
@@ -8657,24 +8617,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 16usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask32x16 {
+                val: _mm512_movepi32_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask32x16 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm512_movm_epi32(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
@@ -9114,24 +9070,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
-        let val = &val;
-        let mut bits = 0u64;
-        let mut i = 0usize;
-        while i < 8usize {
-            if val[i] != 0 {
-                bits |= 1u64 << i;
+        unsafe {
+            let lanes = crate::support::checked_transmute_copy(&val);
+            mask64x8 {
+                val: _mm512_movepi64_mask(lanes),
+                simd: self,
             }
-            i += 1;
-        }
-        mask64x8 {
-            val: (bits) as _,
-            simd: self,
         }
     }
     #[inline(always)]
     fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
-        let bits = u64::from((a).val);
-        core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+        unsafe {
+            let lanes = _mm512_movm_epi64(a.val);
+            crate::support::checked_transmute_copy(&lanes)
+        }
     }
     #[inline(always)]
     fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index a95088530..c9f34b133 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -928,25 +928,25 @@ impl X86 {
         kind: crate::ops::RefKind,
     ) -> TokenStream {
         assert_eq!(vec_ty.scalar, ScalarType::Mask);
-        let len = vec_ty.len;
-        let val_ref = if kind == crate::ops::RefKind::Value {
+        let movepi_mask = intrinsic_ident(
+            &format!("movepi{}", vec_ty.scalar_bits),
+            "mask",
+            vec_ty.n_bits(),
+        );
+        let transmute_src = if kind == crate::ops::RefKind::Value {
             quote! { &val }
         } else {
             quote! { val }
         };
-        let result = avx512_mask_value(vec_ty, quote! { bits });
+        // Mask arrays are specified as either 0 or -1 per lane, so the sign bit is the
+        // truth value. Other lane values have unspecified results.
+        let result = avx512_mask_register_value(vec_ty, quote! { #movepi_mask(lanes) });
         quote! {
             #method_sig {
-                let val = #val_ref;
-                let mut bits = 0u64;
-                let mut i = 0usize;
-                while i < #len {
-                    if val[i] != 0 {
-                        bits |= 1u64 << i;
-                    }
-                    i += 1;
+                unsafe {
+                    let lanes = crate::support::checked_transmute_copy(#transmute_src);
+                    #result
                 }
-                #result
             }
         }
     }
@@ -962,11 +962,17 @@ impl X86 {
             kind == crate::ops::RefKind::Value,
             "mask array references are not exposed"
         );
-        let bits = avx512_mask_bits_expr(quote! { a });
+        let movm = intrinsic_ident(
+            "movm",
+            op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
+            vec_ty.n_bits(),
+        );
         quote! {
             #method_sig {
-                let bits = #bits;
-                core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 })
+                unsafe {
+                    let lanes = #movm(a.val);
+                    crate::support::checked_transmute_copy(&lanes)
+                }
             }
         }
     }

From 79273836c5e628a36140ea4dceab3730d3f45b0f Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 19:29:22 +0100
Subject: [PATCH 09/55] Split set_mask into a backend method so it could be
 specialized per backend, and specialize it for AVX-512. Add test coverage
 that sets every single bit and verifies it was set correctly.

---
 fearless_simd/src/generated/avx2.rs           | 132 +++++++++++++
 fearless_simd/src/generated/avx512.rs         | 180 ++++++++++++++++++
 fearless_simd/src/generated/fallback.rs       | 132 +++++++++++++
 fearless_simd/src/generated/neon.rs           | 132 +++++++++++++
 fearless_simd/src/generated/simd_trait.rs     |  24 +++
 fearless_simd/src/generated/simd_types.rs     | 108 ++---------
 fearless_simd/src/generated/sse4_2.rs         | 132 +++++++++++++
 fearless_simd/src/generated/wasm.rs           | 132 +++++++++++++
 fearless_simd_gen/src/generic.rs              |  22 +++
 fearless_simd_gen/src/mk_fallback.rs          |   5 +-
 fearless_simd_gen/src/mk_neon.rs              |   5 +-
 fearless_simd_gen/src/mk_simd_types.rs        |  13 +-
 fearless_simd_gen/src/mk_wasm.rs              |   3 +-
 fearless_simd_gen/src/mk_x86.rs               |  31 ++-
 fearless_simd_gen/src/ops.rs                  |  20 +-
 .../harness/lm_generated/mask_roundtrip.rs    |  80 ++++++++
 16 files changed, 1037 insertions(+), 114 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 2c2dfa5aa..40d2c7d8c 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -919,6 +919,17 @@ impl Simd for Avx2 {
         unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1430,6 +1441,17 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1946,6 +1968,17 @@ impl Simd for Avx2 {
         unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2267,6 +2300,17 @@ impl Simd for Avx2 {
         unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -3377,6 +3421,17 @@ impl Simd for Avx2 {
         unsafe { _mm256_movemask_epi8(a.into()) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -4093,6 +4148,17 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -4746,6 +4812,17 @@ impl Simd for Avx2 {
         unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -5142,6 +5219,17 @@ impl Simd for Avx2 {
         unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x4(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x4(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -6292,6 +6380,17 @@ impl Simd for Avx2 {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 64usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x64(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x64(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -7048,6 +7147,17 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x32(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x32(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7779,6 +7889,17 @@ impl Simd for Avx2 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8228,6 +8349,17 @@ impl Simd for Avx2 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 986ea6f93..7511cd8a8 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -963,6 +963,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 65535u64
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         mask8x16 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
@@ -1539,6 +1554,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 255u64
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         mask16x8 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
@@ -2125,6 +2155,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 15u64
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         mask32x4 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _,
@@ -2501,6 +2546,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 3u64
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 2usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         mask64x2 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _,
@@ -3722,6 +3782,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 4294967295u64
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         mask8x32 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _,
@@ -4474,6 +4549,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 65535u64
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         mask16x16 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
@@ -5207,6 +5297,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 255u64
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         mask32x8 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
@@ -5652,6 +5757,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 15u64
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         mask64x4 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _,
@@ -7024,6 +7144,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & u64::MAX
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 64usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated; // stored without a cast: `val` takes the `u64` directly here
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         mask8x64 {
             val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX,
@@ -7863,6 +7998,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 4294967295u64
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         mask16x32 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _,
@@ -8644,6 +8794,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 65535u64
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         mask32x16 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
@@ -9097,6 +9262,21 @@ impl Simd for Avx512 {
         u64::from((a).val) & 255u64
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        // The mask is held as a compact bitmask, so lane `index` corresponds to bit `index`.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        let lane_bit = 1u64 << index;
+        let current = u64::from(a.val);
+        let updated = if value {
+            current | lane_bit
+        } else {
+            current & !lane_bit
+        };
+        a.val = updated as _; // converts back to the mask's own storage type
+        a.simd = self;
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         mask64x8 {
             val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
index 5bb26fa05..43e06eb19 100644
--- a/fearless_simd/src/generated/fallback.rs
+++ b/fearless_simd/src/generated/fallback.rs
@@ -1841,6 +1841,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         [
             i8::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -3006,6 +3017,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         [
             i16::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -3863,6 +3885,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x4(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x4(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         [
             i32::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -4280,6 +4313,17 @@ impl Simd for Fallback {
         bits
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 2usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x2(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x2(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         [
             i64::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -5281,6 +5325,17 @@ impl Simd for Fallback {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x32(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x32(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -5942,6 +5997,17 @@ impl Simd for Fallback {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -6583,6 +6649,17 @@ impl Simd for Fallback {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -7005,6 +7082,17 @@ impl Simd for Fallback {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x4(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x4(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -8094,6 +8182,17 @@ impl Simd for Fallback {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 64usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x64(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x64(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -8783,6 +8882,17 @@ impl Simd for Fallback {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x32(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x32(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -9436,6 +9546,17 @@ impl Simd for Fallback {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -9844,6 +9965,17 @@ impl Simd for Fallback {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index ca5486cbc..2eaccf475 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -816,6 +816,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { vandq_s8(a.into(), b.into()).simd_into(self) }
     }
@@ -1319,6 +1330,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { vandq_s16(a.into(), b.into()).simd_into(self) }
     }
@@ -1826,6 +1848,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x4(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x4(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { vandq_s32(a.into(), b.into()).simd_into(self) }
     }
@@ -2150,6 +2183,17 @@ impl Simd for Neon {
         }
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 2usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x2(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x2(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { vandq_s64(a.into(), b.into()).simd_into(self) }
     }
@@ -3252,6 +3296,17 @@ impl Simd for Neon {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x32(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x32(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3993,6 +4048,17 @@ impl Simd for Neon {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4727,6 +4793,17 @@ impl Simd for Neon {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5199,6 +5276,17 @@ impl Simd for Neon {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 4usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x4(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x4(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6373,6 +6461,17 @@ impl Simd for Neon {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 64usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask8x64(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask8x64(unpacked);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -7145,6 +7244,17 @@ impl Simd for Neon {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 32usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask16x32(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask16x32(unpacked);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7899,6 +8009,17 @@ impl Simd for Neon {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 16usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask32x16(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask32x16(unpacked);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8371,6 +8492,17 @@ impl Simd for Neon {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        // Replace a single lane: only element `index` of the lane array is rewritten.
+        const LANES: usize = 8usize;
+        assert!(index < LANES, "mask lane index {index} is out of bounds for {} lanes", LANES);
+        // A set lane is written as all-ones (`!0`), a cleared lane as `0`.
+        let fill = if value { !0 } else { 0 };
+        let mut unpacked = self.as_array_mask64x8(*a);
+        unpacked[index] = fill;
+        *a = self.load_array_mask64x8(unpacked);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 4bde9b4e3..1ecd25438 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -402,6 +402,8 @@ pub trait Simd:
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -605,6 +607,8 @@ pub trait Simd:
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -810,6 +814,8 @@ pub trait Simd:
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -941,6 +947,8 @@ pub trait Simd:
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1258,6 +1266,8 @@ pub trait Simd:
     fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1469,6 +1479,8 @@ pub trait Simd:
     fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1680,6 +1692,8 @@ pub trait Simd:
     fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1815,6 +1829,8 @@ pub trait Simd:
     fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2134,6 +2150,8 @@ pub trait Simd:
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2343,6 +2361,8 @@ pub trait Simd:
     fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2552,6 +2572,8 @@ pub trait Simd:
     fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2683,6 +2705,8 @@ pub trait Simd:
     fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self>;
     #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
     fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64;
+    #[doc = "Set one logical lane of a SIMD mask."]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> ();
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index 335490fd6..c05fa1b73 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -688,14 +688,7 @@ impl<S: Simd> SimdMask<S> for mask8x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 16,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16
-        );
-        let mut lanes = self.simd.as_array_mask8x16(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask8x16(lanes);
+        self.simd.set_mask8x16(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
@@ -1156,14 +1149,7 @@ impl<S: Simd> SimdMask<S> for mask16x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 8,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8
-        );
-        let mut lanes = self.simd.as_array_mask16x8(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask16x8(lanes);
+        self.simd.set_mask16x8(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
@@ -1648,14 +1634,7 @@ impl<S: Simd> SimdMask<S> for mask32x4<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 4,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4
-        );
-        let mut lanes = self.simd.as_array_mask32x4(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask32x4(lanes);
+        self.simd.set_mask32x4(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
@@ -1985,14 +1964,7 @@ impl<S: Simd> SimdMask<S> for mask64x2<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 2,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2
-        );
-        let mut lanes = self.simd.as_array_mask64x2(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask64x2(lanes);
+        self.simd.set_mask64x2(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
@@ -2727,14 +2699,7 @@ impl<S: Simd> SimdMask<S> for mask8x32<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 32,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32
-        );
-        let mut lanes = self.simd.as_array_mask8x32(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask8x32(lanes);
+        self.simd.set_mask8x32(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
@@ -3221,14 +3186,7 @@ impl<S: Simd> SimdMask<S> for mask16x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 16,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16
-        );
-        let mut lanes = self.simd.as_array_mask16x16(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask16x16(lanes);
+        self.simd.set_mask16x16(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
@@ -3727,14 +3685,7 @@ impl<S: Simd> SimdMask<S> for mask32x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 8,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8
-        );
-        let mut lanes = self.simd.as_array_mask32x8(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask32x8(lanes);
+        self.simd.set_mask32x8(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
@@ -4071,14 +4022,7 @@ impl<S: Simd> SimdMask<S> for mask64x4<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 4,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4
-        );
-        let mut lanes = self.simd.as_array_mask64x4(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask64x4(lanes);
+        self.simd.set_mask64x4(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
@@ -4801,14 +4745,7 @@ impl<S: Simd> SimdMask<S> for mask8x64<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 64,
-            "mask lane index {index} is out of bounds for {} lanes",
-            64
-        );
-        let mut lanes = self.simd.as_array_mask8x64(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask8x64(lanes);
+        self.simd.set_mask8x64(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
@@ -5283,14 +5220,7 @@ impl<S: Simd> SimdMask<S> for mask16x32<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 32,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32
-        );
-        let mut lanes = self.simd.as_array_mask16x32(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask16x32(lanes);
+        self.simd.set_mask16x32(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
@@ -5789,14 +5719,7 @@ impl<S: Simd> SimdMask<S> for mask32x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 16,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16
-        );
-        let mut lanes = self.simd.as_array_mask32x16(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask32x16(lanes);
+        self.simd.set_mask32x16(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
@@ -6127,14 +6050,7 @@ impl<S: Simd> SimdMask<S> for mask64x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(
-            index < 8,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8
-        );
-        let mut lanes = self.simd.as_array_mask64x8(*self);
-        lanes[index] = if value { !0 } else { 0 };
-        *self = self.simd.load_array_mask64x8(lanes);
+        self.simd.set_mask64x8(self, index, value);
     }
     #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index a2d90513e..a2cf7f67b 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -959,6 +959,17 @@ impl Simd for Sse4_2 {
         unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1479,6 +1490,17 @@ impl Simd for Sse4_2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2004,6 +2026,17 @@ impl Simd for Sse4_2 {
         unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2331,6 +2364,17 @@ impl Simd for Sse4_2 {
         unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 }
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -3367,6 +3411,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -4066,6 +4121,17 @@ impl Simd for Sse4_2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4754,6 +4820,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5202,6 +5279,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6381,6 +6469,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -7129,6 +7228,17 @@ impl Simd for Sse4_2 {
         }
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7844,6 +7954,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8278,6 +8399,17 @@ impl Simd for Sse4_2 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index faeffed9e..6ace3b9c1 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -869,6 +869,17 @@ impl Simd for WasmSimd128 {
         i8x16_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask8x16(self, a: &mut mask8x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask8x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -1369,6 +1380,17 @@ impl Simd for WasmSimd128 {
         i16x8_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask16x8(self, a: &mut mask16x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask16x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -1873,6 +1895,17 @@ impl Simd for WasmSimd128 {
         i32x4_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask32x4(self, a: &mut mask32x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask32x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -2223,6 +2256,17 @@ impl Simd for WasmSimd128 {
         i64x2_bitmask(a.into()) as u64
     }
     #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -3266,6 +3310,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3961,6 +4016,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4649,6 +4715,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5097,6 +5174,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 2usize)
     }
     #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6247,6 +6335,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 32usize)
     }
     #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6977,6 +7076,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 16usize)
     }
     #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask16x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7689,6 +7799,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 8usize)
     }
     #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x16(lanes);
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8123,6 +8244,17 @@ impl Simd for WasmSimd128 {
         lo | (hi << 4usize)
     }
     #[inline(always)]
+    fn set_mask64x8(self, a: &mut mask64x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask64x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x8(lanes);
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index 233ad6ffa..c4a11ee9e 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -208,6 +208,9 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
                 }
             }
         }
+        OpSig::MaskSet => {
+            panic!("Mask set must operate on the full mask vector")
+        }
         OpSig::LoadInterleaved {
             block_size,
             block_count,
@@ -509,3 +512,22 @@ pub(crate) fn generic_mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType)
         }
     }
 }
+
+pub(crate) fn generic_mask_set(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    let from_array = generic_op_name("load_array", vec_ty);
+    let as_array = generic_op_name("as_array", vec_ty);
+    let len = vec_ty.len;
+
+    quote! {
+        #method_sig {
+            assert!(
+                index < #len,
+                "mask lane index {index} is out of bounds for {} lanes",
+                #len
+            );
+            let mut lanes = self.#as_array(*a);
+            lanes[index] = if value { !0 } else { 0 };
+            *a = self.#from_array(lanes);
+        }
+    }
+}
diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs
index 70122a9e7..92099258a 100644
--- a/fearless_simd_gen/src/mk_fallback.rs
+++ b/fearless_simd_gen/src/mk_fallback.rs
@@ -3,8 +3,8 @@
 
 use crate::arch::fallback;
 use crate::generic::{
-    generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name,
-    generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_from_bytes, generic_mask_from_bitmask, generic_mask_set, generic_mask_to_bitmask,
+    generic_op_name, generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, OpSig, RefKind, valid_reinterpret};
@@ -466,6 +466,7 @@ impl Level for Fallback {
             }
             OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 9765c06df..a1fb02993 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -5,8 +5,8 @@ use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{ToTokens as _, format_ident, quote};
 
 use crate::generic::{
-    generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_store_array,
-    generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_as_array, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name,
+    generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, SlideGranularity, valid_reinterpret};
@@ -532,6 +532,7 @@ impl Level for Neon {
             }
             OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs
index 3960e3281..b6f2aafce 100644
--- a/fearless_simd_gen/src/mk_simd_types.rs
+++ b/fearless_simd_gen/src/mk_simd_types.rs
@@ -298,6 +298,7 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
     let splat = generic_op_name("splat", ty);
     let from_bitmask_op = generic_op_name("from_bitmask", ty);
     let to_bitmask_op = generic_op_name("to_bitmask", ty);
+    let set_op = generic_op_name("set", ty);
     let from_array_op = generic_op_name("load_array", ty);
     let as_array_op = generic_op_name("as_array", ty);
     let mut methods = vec![];
@@ -322,9 +323,6 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
         }
     }
 
-    // Current backends store masks as signed integer lanes, so `set` uses a generic
-    // spill/update/reload path. Future compact predicate backends such as AVX-512 can
-    // switch this implementation to `to_bitmask`/`from_bitmask`.
     quote! {
         impl<S: Simd> SimdMask<S> for #name<S> {
             type Element = #scalar;
@@ -352,14 +350,7 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
 
             #[inline(always)]
             fn set(&mut self, index: usize, value: bool) {
-                assert!(
-                    index < #len,
-                    "mask lane index {index} is out of bounds for {} lanes",
-                    #len
-                );
-                let mut lanes = self.simd.#as_array_op(*self);
-                lanes[index] = if value { !0 } else { 0 };
-                *self = self.simd.#from_array_op(lanes);
+                self.simd.#set_op(self, index, value);
             }
 
             #[inline(always)]
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 8c4e2eceb..af73d3202 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -7,7 +7,7 @@ use quote::{format_ident, quote};
 use crate::arch::wasm::{arch_prefix, v128_intrinsic};
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes,
+    generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
     integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
@@ -594,6 +594,7 @@ impl Level for WasmSimd128 {
             }
             OpSig::MaskFromBitmask => mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index c9f34b133..ee2845837 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -8,7 +8,7 @@ use crate::arch::x86::{
 };
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes,
+    generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
     integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
@@ -314,6 +314,10 @@ impl Level for X86 {
             } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition),
             OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskSet if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask => {
+                self.handle_avx512_mask_set(method_sig, vec_ty)
+            }
+            OpSig::MaskSet => generic_mask_set(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
@@ -977,6 +981,31 @@ impl X86 {
         }
     }
 
+    pub(crate) fn handle_avx512_mask_set(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+    ) -> TokenStream {
+        assert_eq!(vec_ty.scalar, ScalarType::Mask);
+        let len = vec_ty.len;
+        let bits = avx512_mask_bits_expr(quote! { *a });
+        let result = avx512_mask_value(vec_ty, quote! { bits });
+
+        quote! {
+            #method_sig {
+                assert!(
+                    index < #len,
+                    "mask lane index {index} is out of bounds for {} lanes",
+                    #len
+                );
+                let bit = 1u64 << index;
+                let bits = #bits;
+                let bits = if value { bits | bit } else { bits & !bit };
+                *a = #result;
+            }
+        }
+    }
+
     pub(crate) fn handle_mask_from_bitmask(
         &self,
         method_sig: TokenStream,
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index 2e3e7b24b..dd9cc7f65 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -110,6 +110,8 @@ pub(crate) enum OpSig {
     MaskFromBitmask,
     /// Takes a mask vector type and returns its compact bitmask representation.
     MaskToBitmask,
+    /// Takes a mutable mask vector, a lane index, and a boolean, and updates the lane in place.
+    MaskSet,
     /// Takes an argument of an array of a certain scalar type, with the length (`block_size` * `block_count`) / [scalar
     /// type's byte size]. Returns a vector type of that scalar type and length.
     ///
@@ -277,6 +279,12 @@ impl Op {
                 let arg0 = &arg_names[0];
                 quote! { (self, #arg0: #ty<Self>) -> u64 }
             }
+            OpSig::MaskSet => {
+                let arg0 = &arg_names[0];
+                let arg1 = &arg_names[1];
+                let arg2 = &arg_names[2];
+                quote! { (self, #arg0: &mut #ty<Self>, #arg1: usize, #arg2: bool) -> () }
+            }
             OpSig::Shift => {
                 let arg0 = &arg_names[0];
                 let arg1 = &arg_names[1];
@@ -353,7 +361,7 @@ impl Op {
             OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } | OpSig::StoreArray => {
                 return None;
             }
-            OpSig::MaskFromBitmask | OpSig::MaskToBitmask => return None,
+            OpSig::MaskFromBitmask | OpSig::MaskToBitmask | OpSig::MaskSet => return None,
             OpSig::Unary
             | OpSig::Cvt { .. }
             | OpSig::Reinterpret { .. }
@@ -583,6 +591,12 @@ const MASK_REPRESENTATION_OPS: &[Op] = &[
         OpSig::MaskToBitmask,
         "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared.",
     ),
+    Op::new(
+        "set",
+        OpKind::AssociatedOnly,
+        OpSig::MaskSet,
+        "Set one logical lane of a SIMD mask.",
+    ),
 ];
 
 const FLOAT_OPS: &[Op] = &[
@@ -1511,6 +1525,7 @@ impl OpSig {
                 | Self::FromArray { .. }
                 | Self::AsArray { .. }
                 | Self::StoreArray
+                | Self::MaskSet
                 | Self::Slide {
                     granularity: SlideGranularity::AcrossBlocks,
                     ..
@@ -1540,6 +1555,7 @@ impl OpSig {
         match self {
             Self::Splat | Self::FromArray { .. } => &["val"],
             Self::MaskFromBitmask => &["bits"],
+            Self::MaskSet => &["a", "index", "value"],
             Self::Unary
             | Self::Split { .. }
             | Self::Cvt { .. }
@@ -1572,6 +1588,7 @@ impl OpSig {
             | Self::FromArray { .. }
             | Self::MaskFromBitmask
             | Self::MaskToBitmask
+            | Self::MaskSet
             | Self::FromBytes { .. }
             | Self::StoreArray => &[],
             Self::Unary
@@ -1634,6 +1651,7 @@ impl OpSig {
             | Self::Shift
             | Self::MaskFromBitmask
             | Self::MaskToBitmask
+            | Self::MaskSet
             | Self::LoadInterleaved { .. }
             | Self::StoreInterleaved { .. }
             | Self::FromArray { .. }
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
index 15963b2a3..ecc6f3c52 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
@@ -4,6 +4,86 @@
 use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
+/// Verifies that `SimdMask::set` can set and clear every lane while keeping
+/// `to_bitmask` and `test` in sync with the expected compact bitmask.
+fn assert_mask_set_roundtrip<S: Simd, M: SimdMask<S>>(simd: S) {
+    let mut mask = M::from_bitmask(simd, 0);
+    let mut expected = 0u64;
+    for i in 0..M::N {
+        mask.set(i, true);
+        expected |= 1u64 << i;
+        assert_eq!(mask.to_bitmask(), expected);
+        assert!(mask.test(i));
+    }
+
+    for i in 0..M::N {
+        mask.set(i, false);
+        expected &= !(1u64 << i);
+        assert_eq!(mask.to_bitmask(), expected);
+        assert!(!mask.test(i));
+    }
+}
+
+#[simd_test]
+fn mask8x16_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask8x16<S>>(simd);
+}
+
+#[simd_test]
+fn mask16x8_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask16x8<S>>(simd);
+}
+
+#[simd_test]
+fn mask32x4_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask32x4<S>>(simd);
+}
+
+#[simd_test]
+fn mask64x2_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask64x2<S>>(simd);
+}
+
+#[simd_test]
+fn mask8x32_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask8x32<S>>(simd);
+}
+
+#[simd_test]
+fn mask16x16_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask16x16<S>>(simd);
+}
+
+#[simd_test]
+fn mask32x8_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask32x8<S>>(simd);
+}
+
+#[simd_test]
+fn mask64x4_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask64x4<S>>(simd);
+}
+
+#[simd_test]
+fn mask8x64_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask8x64<S>>(simd);
+}
+
+#[simd_test]
+fn mask16x32_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask16x32<S>>(simd);
+}
+
+#[simd_test]
+fn mask32x16_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask32x16<S>>(simd);
+}
+
+#[simd_test]
+fn mask64x8_set_roundtrip<S: Simd>(simd: S) {
+    assert_mask_set_roundtrip::<S, mask64x8<S>>(simd);
+}
+
 #[simd_test]
 fn mask8x16_bitmask_roundtrip<S: Simd>(simd: S) {
     for bits in 0..=0xffff_u64 {

From 57de1298dedaf59b1eeea9e557e8e970b0a42e10 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 20:15:05 +0100
Subject: [PATCH 10/55] Optimize load_interleaved/store_interleaved for
 AVX-512. Add one more test to exercise it.

i8/u8 test is still bad because of https://github.com/rust-lang/rust/issues/156891
---
 fearless_simd/src/generated/avx512.rs    | 220 +++++++----------------
 fearless_simd_gen/src/mk_x86.rs          |  95 ++++++++++
 fearless_simd_tests/tests/harness/mod.rs |  50 ++++++
 3 files changed, 205 insertions(+), 160 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 7511cd8a8..c18ddf916 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -6220,46 +6220,22 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
         unsafe {
-            let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
-            let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
-            let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
-            let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
-            let tmp0 = _mm_unpacklo_ps(v0, v1);
-            let tmp1 = _mm_unpackhi_ps(v0, v1);
-            let tmp2 = _mm_unpacklo_ps(v2, v3);
-            let tmp3 = _mm_unpackhi_ps(v2, v3);
-            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            self.combine_f32x8(
-                self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
+            let lanes = _mm512_loadu_ps(src.as_ptr() as *const _);
+            _mm512_permutexvar_ps(
+                _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                lanes,
             )
+            .simd_into(self)
         }
     }
     #[inline(always)]
     fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        let (v01, v23) = self.split_f32x16(a);
-        let (v0, v1) = self.split_f32x8(v01);
-        let (v2, v3) = self.split_f32x8(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_ps(v0, v1);
-            let tmp1 = _mm_unpackhi_ps(v0, v1);
-            let tmp2 = _mm_unpacklo_ps(v2, v3);
-            let tmp3 = _mm_unpackhi_ps(v2, v3);
-            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
-            _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
-            _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
+        unsafe {
+            let lanes = _mm512_permutexvar_ps(
+                _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                a.into(),
+            );
+            _mm512_storeu_ps(dest.as_mut_ptr() as *mut _, lanes);
         }
     }
     #[inline(always)]
@@ -7052,56 +7028,32 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
         unsafe {
-            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
-            let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
-            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
-            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
-            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            let v0 = _mm_shuffle_epi8(v0, mask);
-            let v1 = _mm_shuffle_epi8(v1, mask);
-            let v2 = _mm_shuffle_epi8(v2, mask);
-            let v3 = _mm_shuffle_epi8(v3, mask);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u8x32(
-                self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
+            let lanes = _mm512_loadu_si512(src.as_ptr() as *const _);
+            _mm512_permutexvar_epi8(
+                _mm512_set_epi8(
+                    63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, 50,
+                    46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41, 37, 33,
+                    29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16,
+                    12, 8, 4, 0,
+                ),
+                lanes,
             )
+            .simd_into(self)
         }
     }
     #[inline(always)]
     fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
-        let (v01, v23) = self.split_u8x64(a);
-        let (v0, v1) = self.split_u8x32(v01);
-        let (v2, v3) = self.split_u8x32(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            let out0 = _mm_shuffle_epi8(out0, mask);
-            let out1 = _mm_shuffle_epi8(out1, mask);
-            let out2 = _mm_shuffle_epi8(out2, mask);
-            let out3 = _mm_shuffle_epi8(out3, mask);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
+        unsafe {
+            let lanes = _mm512_permutexvar_epi8(
+                _mm512_set_epi8(
+                    63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27, 11,
+                    58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38, 22, 6, 53,
+                    37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33, 17, 1, 48, 32,
+                    16, 0,
+                ),
+                a.into(),
+            );
+            _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes);
         }
     }
     #[inline(always)]
@@ -7898,56 +7850,28 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
         unsafe {
-            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
-            let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
-            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
-            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
-            let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-            let v0 = _mm_shuffle_epi8(v0, mask);
-            let v1 = _mm_shuffle_epi8(v1, mask);
-            let v2 = _mm_shuffle_epi8(v2, mask);
-            let v3 = _mm_shuffle_epi8(v3, mask);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u16x16(
-                self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
+            let lanes = _mm512_loadu_si512(src.as_ptr() as *const _);
+            _mm512_permutexvar_epi16(
+                _mm512_set_epi16(
+                    31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, 13,
+                    9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0,
+                ),
+                lanes,
             )
+            .simd_into(self)
         }
     }
     #[inline(always)]
     fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
-        let (v01, v23) = self.split_u16x32(a);
-        let (v0, v1) = self.split_u16x16(v01);
-        let (v2, v3) = self.split_u16x16(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-            let out0 = _mm_shuffle_epi8(out0, mask);
-            let out1 = _mm_shuffle_epi8(out1, mask);
-            let out2 = _mm_shuffle_epi8(out2, mask);
-            let out3 = _mm_shuffle_epi8(out3, mask);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
+        unsafe {
+            let lanes = _mm512_permutexvar_epi16(
+                _mm512_set_epi16(
+                    31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3, 26,
+                    18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0,
+                ),
+                a.into(),
+            );
+            _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes);
         }
     }
     #[inline(always)]
@@ -8708,46 +8632,22 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
         unsafe {
-            let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
-            let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
-            let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
-            let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u32x8(
-                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
+            let lanes = _mm512_loadu_si512(src.as_ptr() as *const _);
+            _mm512_permutexvar_epi32(
+                _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                lanes,
             )
+            .simd_into(self)
         }
     }
     #[inline(always)]
     fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        let (v01, v23) = self.split_u32x16(a);
-        let (v0, v1) = self.split_u32x8(v01);
-        let (v2, v3) = self.split_u32x8(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
+        unsafe {
+            let lanes = _mm512_permutexvar_epi32(
+                _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                a.into(),
+            );
+            _mm512_storeu_si512(dest.as_mut_ptr() as *mut _, lanes);
         }
     }
     #[inline(always)]
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index ee2845837..f14bbd269 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -791,6 +791,11 @@ fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident {
     intrinsic_ident("permutex2var", suffix, vec_ty.n_bits())
 }
 
+fn avx512_permutexvar_intrinsic(vec_ty: &VecType) -> Ident {
+    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+    intrinsic_ident("permutexvar", suffix, vec_ty.n_bits())
+}
+
 fn avx512_mask_blend_intrinsic(vec_ty: &VecType) -> Ident {
     let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
     intrinsic_ident("mask_blend", suffix, vec_ty.n_bits())
@@ -863,6 +868,20 @@ fn avx512_index_vector(vec_ty: &VecType, indices: impl IntoIterator<Item = usize
     }
 }
 
+fn interleaved_load_indices(len: usize, block_count: usize) -> Vec<usize> {
+    let stream_len = len / block_count;
+    (0..block_count)
+        .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream))
+        .collect()
+}
+
+fn interleaved_store_indices(len: usize, block_count: usize) -> Vec<usize> {
+    let stream_len = len / block_count;
+    (0..stream_len)
+        .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i))
+        .collect()
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
@@ -2810,6 +2829,14 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
+            return self.handle_avx512_load_interleaved(
+                method_sig,
+                vec_ty,
+                block_size,
+                block_count,
+            );
+        }
         let expr = match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
@@ -2928,6 +2955,36 @@ impl X86 {
         }
     }
 
+    pub(crate) fn handle_avx512_load_interleaved(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+        block_size: u16,
+        block_count: u16,
+    ) -> TokenStream {
+        assert_eq!(
+            block_size, 128,
+            "only 128-bit blocks are currently supported"
+        );
+        assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        assert_eq!(vec_ty.n_bits(), 512); // this path handles full 512-bit vectors only
+        let load_unaligned = intrinsic_ident("loadu", coarse_type(vec_ty), vec_ty.n_bits());
+        let permute = avx512_permutexvar_intrinsic(vec_ty); // permutexvar for this lane type
+        let indices = avx512_index_vector(
+            vec_ty,
+            interleaved_load_indices(vec_ty.len, block_count as usize),
+        );
+        // Emitted body: one unaligned load, then one permute that groups lanes by stream.
+        quote! {
+            #method_sig {
+                unsafe {
+                    let lanes = #load_unaligned(src.as_ptr() as *const _);
+                    #permute(#indices, lanes).simd_into(self)
+                }
+            }
+        }
+    }
+
     pub(crate) fn handle_store_interleaved(
         &self,
         method_sig: TokenStream,
@@ -2940,6 +2997,14 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
+            return self.handle_avx512_store_interleaved(
+                method_sig,
+                vec_ty,
+                block_size,
+                block_count,
+            );
+        }
         let expr = match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
@@ -3059,6 +3124,36 @@ impl X86 {
         }
     }
 
+    pub(crate) fn handle_avx512_store_interleaved(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+        block_size: u16,
+        block_count: u16,
+    ) -> TokenStream {
+        assert_eq!(
+            block_size, 128,
+            "only 128-bit blocks are currently supported"
+        );
+        assert_eq!(block_count, 4, "only count of 4 is currently supported");
+        assert_eq!(vec_ty.n_bits(), 512); // this path handles full 512-bit vectors only
+        let store_unaligned = intrinsic_ident("storeu", coarse_type(vec_ty), vec_ty.n_bits());
+        let permute = avx512_permutexvar_intrinsic(vec_ty); // permutexvar for this lane type
+        let indices = avx512_index_vector(
+            vec_ty,
+            interleaved_store_indices(vec_ty.len, block_count as usize),
+        );
+        // Emitted body: one permute that interleaves the streams, then one unaligned store.
+        quote! {
+            #method_sig {
+                unsafe {
+                    let lanes = #permute(#indices, a.into());
+                    #store_unaligned(dest.as_mut_ptr() as *mut _, lanes);
+                }
+            }
+        }
+    }
+
     /// Generates versions of the "alignr" intrinsics that take the shift amount as a regular argument instead of a
     /// const generic argument, to make them easier to use in higher-level operations. These are low-level helpers that
     /// inherit the semantics of the underlying `alignr` intrinsics, so the argument order is backwards from ARM's
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index ca482799a..ac116afb1 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -839,6 +839,56 @@ fn all_false_mask8x16<S: Simd>(simd: S) {
     assert!(!simd.all_false_mask8x16(one_neg));
 }
 
+#[simd_test]
+fn load_interleaved_128_f32x16<S: Simd>(simd: S) {
+    let data: [f32; 16] = [
+        0.0,
+        f32::NAN,
+        f32::INFINITY,
+        -3.0,
+        4.0,
+        -0.0,
+        6.0,
+        f32::NEG_INFINITY,
+        8.0,
+        9.0,
+        -10.0,
+        11.0,
+        f32::MIN,
+        13.0,
+        f32::MAX,
+        15.0,
+    ];
+    let result = simd.load_interleaved_128_f32x16(&data);
+    // Expected order groups every 4th lane: source lanes 0,4,8,12, then 1,5,9,13, etc.
+    let expected = [
+        0.0,
+        4.0,
+        8.0,
+        f32::MIN,
+        f32::NAN,
+        -0.0,
+        9.0,
+        13.0,
+        f32::INFINITY,
+        6.0,
+        -10.0,
+        f32::MAX,
+        -3.0,
+        f32::NEG_INFINITY,
+        11.0,
+        15.0,
+    ];
+
+    // Note: f32::NAN != f32::NAN hence we transmute to compare the bit pattern
+    unsafe {
+        assert_eq!(
+            std::mem::transmute::<[f32; 16], [u32; 16]>(*result),
+            std::mem::transmute::<[f32; 16], [u32; 16]>(expected)
+        );
+    }
+}
+
 #[simd_test]
 fn load_interleaved_128_u32x16<S: Simd>(simd: S) {
     #[rustfmt::skip]

From 2630928b263f1e52f56738e55deb5dd5fb6922a5 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 20:36:36 +0100
Subject: [PATCH 11/55] Optimize variable shifts for 8-bit and 16-bit types.
 Expand test coverage. For the 8-bit left shift only, LLVM autovectorizes the
 scalar fallback into GFNI instructions on 256-bit halves, which emits more
 instructions but schedules better and ends up slightly faster according to
 llvm-mca on Sapphire Rapids. However, the difference isn't huge and I don't
 want to rely on autovectorization because of its fragility.

---
 fearless_simd/src/generated/avx512.rs         | 204 +++++++++++++++---
 fearless_simd_gen/src/mk_x86.rs               |  61 ++++++
 .../harness/lm_generated/extended_512.rs      | 108 ++++++++++
 fearless_simd_tests/tests/harness/mod.rs      | 133 ++++++++++++
 4 files changed, 482 insertions(+), 24 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index c18ddf916..f80743ad7 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -547,7 +547,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm_setzero_si128();
+            let value_extend = zero;
+            let lo_values = _mm_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
@@ -563,7 +576,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm_setzero_si128();
+            let value_extend = _mm_cmpgt_epi8(zero, val);
+            let lo_values = _mm_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo_shifted = _mm_and_si128(_mm_srav_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm_and_si128(_mm_srav_epi16(hi_values, hi_counts), byte_mask);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
@@ -806,7 +832,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm_setzero_si128();
+            let value_extend = zero;
+            let lo_values = _mm_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
@@ -822,7 +861,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm_setzero_si128();
+            let value_extend = zero;
+            let lo_values = _mm_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm_set1_epi16(0x00ff);
+            let lo_shifted = _mm_and_si128(_mm_srlv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm_and_si128(_mm_srlv_epi16(hi_values, hi_counts), byte_mask);
+            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
@@ -1171,7 +1223,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
@@ -1179,7 +1231,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe { _mm_srav_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
@@ -1405,7 +1457,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
@@ -1413,7 +1465,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe { _mm_srlv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
@@ -3178,7 +3230,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm256_setzero_si256();
+            let value_extend = zero;
+            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm256_set1_epi16(0x00ff);
+            let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
@@ -3194,7 +3259,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm256_setzero_si256();
+            let value_extend = _mm256_cmpgt_epi8(zero, val);
+            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm256_set1_epi16(0x00ff);
+            let lo_shifted = _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask);
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
@@ -3538,7 +3616,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm256_setzero_si256();
+            let value_extend = zero;
+            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm256_set1_epi16(0x00ff);
+            let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
@@ -3554,7 +3645,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm256_setzero_si256();
+            let value_extend = zero;
+            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm256_set1_epi16(0x00ff);
+            let lo_shifted = _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask);
+            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
@@ -4018,7 +4122,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
@@ -4028,7 +4132,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe { _mm256_srav_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
@@ -4331,7 +4435,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
@@ -4341,7 +4445,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe { _mm256_srlv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
@@ -6437,7 +6541,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm512_setzero_si512();
+            let value_extend = zero;
+            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm512_set1_epi16(0x00ff);
+            let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
@@ -6459,7 +6576,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm512_setzero_si512();
+            let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val));
+            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm512_set1_epi16(0x00ff);
+            let lo_shifted = _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
@@ -6815,7 +6945,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm512_setzero_si512();
+            let value_extend = zero;
+            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm512_set1_epi16(0x00ff);
+            let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
@@ -6831,7 +6974,20 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe {
+            let val = a.into();
+            let counts = b.into();
+            let zero = _mm512_setzero_si512();
+            let value_extend = zero;
+            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+            let byte_mask = _mm512_set1_epi16(0x00ff);
+            let lo_shifted = _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask);
+            let hi_shifted = _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask);
+            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
+        }
     }
     #[inline(always)]
     fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
@@ -7326,7 +7482,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
@@ -7336,7 +7492,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe { _mm512_srav_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
@@ -7659,7 +7815,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
@@ -7669,7 +7825,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        unsafe { _mm512_srlv_epi16(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index f14bbd269..20f5d8879 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -1580,6 +1580,13 @@ impl X86 {
                     }
                 }
             }
+            "shlv" | "shrv"
+                if *self == Self::Avx512
+                    && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)
+                    && matches!(vec_ty.scalar_bits, 8 | 16) =>
+            {
+                self.handle_avx512_narrow_variable_shift(method, vec_ty)
+            }
             "shlv" | "shrv"
                 if matches!(self, Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32 =>
             {
@@ -1614,6 +1621,60 @@ impl X86 {
         }
     }
 
+    fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream {
+        assert!(*self == Self::Avx512);
+        assert!(matches!(vec_ty.scalar_bits, 8 | 16));
+        let name = match (method, vec_ty.scalar) {
+            ("shrv", ScalarType::Int) => "srav", // arithmetic (sign-filling) right shift
+            ("shrv", _) => "srlv", // logical (zero-filling) right shift
+            ("shlv", _) => "sllv", // left shift
+            _ => unreachable!(),
+        };
+        let shift_intrinsic = intrinsic_ident(name, "epi16", vec_ty.n_bits());
+        // 16-bit lanes map directly onto the epi16 variable-shift intrinsic.
+        if vec_ty.scalar_bits == 16 {
+            return quote! {
+                unsafe { #shift_intrinsic(a.into(), b.into()).simd_into(self) }
+            };
+        }
+        // 8-bit lanes: widen to 16 bits, shift, keep the low byte, then pack back to bytes.
+        let ty_bits = vec_ty.n_bits();
+        let unpack_hi = unpack_intrinsic(ScalarType::Int, 8, false, ty_bits);
+        let unpack_lo = unpack_intrinsic(ScalarType::Int, 8, true, ty_bits);
+        let set0 = intrinsic_ident("setzero", coarse_type(vec_ty), ty_bits);
+        let and = intrinsic_ident("and", coarse_type(vec_ty), ty_bits);
+        let set1_epi16 = intrinsic_ident("set1", "epi16", ty_bits);
+        let pack = pack_intrinsic(16, false, ty_bits);
+        let value_extend = match (method, vec_ty.scalar) {
+            ("shlv", _) | (_, ScalarType::Unsigned) => quote! { zero },
+            ("shrv", ScalarType::Int) if ty_bits == 512 => {
+                quote! { _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val)) }
+            }
+            ("shrv", ScalarType::Int) => {
+                let cmpgt = intrinsic_ident("cmpgt", "epi8", ty_bits);
+                quote! { #cmpgt(zero, val) } // all-ones byte wherever val < 0
+            }
+            _ => unreachable!(),
+        };
+        // Counts are always zero-extended; values are sign-extended only for signed shrv.
+        quote! {
+            unsafe {
+                let val = a.into();
+                let counts = b.into();
+                let zero = #set0();
+                let value_extend = #value_extend;
+                let lo_values = #unpack_lo(val, value_extend);
+                let hi_values = #unpack_hi(val, value_extend);
+                let lo_counts = #unpack_lo(counts, zero);
+                let hi_counts = #unpack_hi(counts, zero);
+                let byte_mask = #set1_epi16(0x00ff); // low byte only, so the pack cannot clamp
+                let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask);
+                let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask);
+                #pack(lo_shifted, hi_shifted).simd_into(self)
+            }
+        }
+    }
+
     pub(crate) fn handle_shift(
         &self,
         method_sig: TokenStream,
diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
index 2de317d3e..bc5c93556 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
@@ -941,6 +941,114 @@ fn shl_u32x16<S: Simd>(simd: S) {
 }
 
 // Vector shift tests (shlv/shrv)
+#[simd_test]
+fn shlv_i8x64<S: Simd>(simd: S) {
+    const A: [i8; 16] = [64, 65, -64, -65, 1, 2, 3, 4, -1, -2, -3, -4, 15, 16, 31, 32];
+    const SHIFTS: [i8; 16] = [1, 2, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4, 3, 2, 1, 0];
+    const EXPECTED: [i8; 16] = [
+        -128, 4, -128, -4, 1, 4, 12, 32, -2, -8, -24, -64, 120, 64, 62, 32,
+    ]; // low 8 bits of each result, e.g. 65 << 2 = 260 wraps to 4
+    let a_vals: [i8; 64] = core::array::from_fn(|i| A[i % 16]); // tile pattern to 64 lanes
+    let shift_vals: [i8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [i8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = i8x64::from_slice(simd, &a_vals);
+    let shifts = i8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected);
+}
+
+#[simd_test]
+fn shrv_i8x64<S: Simd>(simd: S) {
+    const A: [i8; 16] = [
+        -128, -64, -33, -1, 127, 64, 33, 1, -2, -4, -8, -16, 0, 2, 4, 8,
+    ];
+    const SHIFTS: [i8; 16] = [1, 2, 3, 7, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3];
+    const EXPECTED: [i8; 16] = [-64, -16, -5, -1, 63, 16, 4, 1, -1, -1, -1, -1, 0, 1, 1, 1];
+    let a_vals: [i8; 64] = core::array::from_fn(|i| A[i % 16]); // tile pattern to 64 lanes
+    let shift_vals: [i8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [i8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = i8x64::from_slice(simd, &a_vals);
+    let shifts = i8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected); // sign-filling: -33 >> 3 == -5
+}
+
+#[simd_test]
+fn shlv_u8x64<S: Simd>(simd: S) {
+    const A: [u8; 16] = [255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127];
+    const SHIFTS: [u8; 16] = [4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1];
+    const EXPECTED: [u8; 16] = [240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254];
+    let a_vals: [u8; 64] = core::array::from_fn(|i| A[i % 16]); // tile pattern to 64 lanes
+    let shift_vals: [u8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [u8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = u8x64::from_slice(simd, &a_vals);
+    let shifts = u8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected); // high bits are dropped: 255 << 4 == 240
+}
+
+#[simd_test]
+fn shrv_u8x64<S: Simd>(simd: S) {
+    const A: [u8; 16] = [255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127];
+    const SHIFTS: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 1, 0, 1, 2, 3, 4, 3, 2, 1];
+    const EXPECTED: [u8; 16] = [127, 32, 8, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 3, 15, 63];
+    let a_vals: [u8; 64] = core::array::from_fn(|i| A[i % 16]); // tile pattern to 64 lanes
+    let shift_vals: [u8; 64] = core::array::from_fn(|i| SHIFTS[i % 16]);
+    let expected: [u8; 64] = core::array::from_fn(|i| EXPECTED[i % 16]);
+    let a = u8x64::from_slice(simd, &a_vals);
+    let shifts = u8x64::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected); // zero-filling: 255 >> 1 == 127
+}
+
+#[simd_test]
+fn shlv_i16x32<S: Simd>(simd: S) {
+    const A: [i16; 8] = [16384, 8192, -16384, -8192, 1, -1, 255, -256];
+    const SHIFTS: [i16; 8] = [1, 2, 1, 2, 15, 1, 4, 3];
+    const EXPECTED: [i16; 8] = [-32768, -32768, -32768, -32768, -32768, -2, 4080, -2048];
+    let a_vals: [i16; 32] = core::array::from_fn(|i| A[i % 8]); // tile pattern to 32 lanes
+    let shift_vals: [i16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [i16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = i16x32::from_slice(simd, &a_vals);
+    let shifts = i16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected); // wraps into the sign bit: 16384 << 1 == -32768
+}
+
+#[simd_test]
+fn shrv_i16x32<S: Simd>(simd: S) {
+    const A: [i16; 8] = [-32768, -16384, -1025, -1, 32767, 16384, 1025, 1];
+    const SHIFTS: [i16; 8] = [1, 2, 3, 15, 1, 2, 3, 0];
+    const EXPECTED: [i16; 8] = [-16384, -4096, -129, -1, 16383, 4096, 128, 1];
+    let a_vals: [i16; 32] = core::array::from_fn(|i| A[i % 8]); // tile pattern to 32 lanes
+    let shift_vals: [i16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [i16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = i16x32::from_slice(simd, &a_vals);
+    let shifts = i16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected); // sign-filling: -1025 >> 3 == -129
+}
+
+#[simd_test]
+fn shlv_u16x32<S: Simd>(simd: S) {
+    const A: [u16; 8] = [65535, 32768, 16384, 8192, 1, 255, 1024, 4096];
+    const SHIFTS: [u16; 8] = [4, 1, 2, 3, 15, 4, 5, 0];
+    const EXPECTED: [u16; 8] = [65520, 0, 0, 0, 32768, 4080, 32768, 4096];
+    let a_vals: [u16; 32] = core::array::from_fn(|i| A[i % 8]); // tile pattern to 32 lanes
+    let shift_vals: [u16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [u16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = u16x32::from_slice(simd, &a_vals);
+    let shifts = u16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a << shifts), expected); // high bits are dropped: 65535 << 4 == 65520
+}
+
+#[simd_test]
+fn shrv_u16x32<S: Simd>(simd: S) {
+    const A: [u16; 8] = [65535, 32768, 16384, 8192, 1, 255, 1024, 4096];
+    const SHIFTS: [u16; 8] = [1, 2, 3, 4, 0, 4, 5, 12];
+    const EXPECTED: [u16; 8] = [32767, 8192, 2048, 512, 1, 15, 32, 1];
+    let a_vals: [u16; 32] = core::array::from_fn(|i| A[i % 8]); // tile pattern to 32 lanes
+    let shift_vals: [u16; 32] = core::array::from_fn(|i| SHIFTS[i % 8]);
+    let expected: [u16; 32] = core::array::from_fn(|i| EXPECTED[i % 8]);
+    let a = u16x32::from_slice(simd, &a_vals);
+    let shifts = u16x32::from_slice(simd, &shift_vals);
+    assert_eq!(*(a >> shifts), expected); // zero-filling: 65535 >> 1 == 32767
+}
+
 #[simd_test]
 fn shrv_i32x16<S: Simd>(simd: S) {
     let a = i32x16::from_slice(
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index ac116afb1..e357c5bb6 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -2497,6 +2497,139 @@ fn shlv_u32x4_varied<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn shlv_i8x16<S: Simd>(simd: S) {
+    let a = i8x16::from_slice(
+        simd,
+        &[64, 65, -64, -65, 1, 2, 3, 4, -1, -2, -3, -4, 15, 16, 31, 32],
+    );
+    let shifts = i8x16::from_slice(simd, &[1, 2, 1, 2, 0, 1, 2, 3, 1, 2, 3, 4, 3, 2, 1, 0]);
+    assert_eq!(
+        *(a << shifts),
+        [
+            -128, 4, -128, -4, 1, 4, 12, 32, -2, -8, -24, -64, 120, 64, 62, 32
+        ]
+    );
+}
+
+#[simd_test]
+fn shrv_i8x16<S: Simd>(simd: S) {
+    let a = i8x16::from_slice(
+        simd,
+        &[
+            -128, -64, -33, -1, 127, 64, 33, 1, -2, -4, -8, -16, 0, 2, 4, 8,
+        ],
+    );
+    let shifts = i8x16::from_slice(simd, &[1, 2, 3, 7, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3]);
+    assert_eq!(
+        *(a >> shifts),
+        [-64, -16, -5, -1, 63, 16, 4, 1, -1, -1, -1, -1, 0, 1, 1, 1]
+    );
+}
+
+#[simd_test]
+fn shlv_u8x16<S: Simd>(simd: S) {
+    let a = u8x16::from_slice(
+        simd,
+        &[255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127],
+    );
+    let shifts = u8x16::from_slice(simd, &[4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1]);
+    assert_eq!(
+        *(a << shifts),
+        [240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254]
+    );
+}
+
+#[simd_test]
+fn shrv_u8x16<S: Simd>(simd: S) {
+    let a = u8x16::from_slice(
+        simd,
+        &[255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127],
+    );
+    let shifts = u8x16::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 1, 0, 1, 2, 3, 4, 3, 2, 1]);
+    assert_eq!(
+        *(a >> shifts),
+        [127, 32, 8, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 3, 15, 63]
+    );
+}
+
+#[simd_test]
+fn shlv_i16x8<S: Simd>(simd: S) {
+    let a = i16x8::from_slice(simd, &[16384, 8192, -16384, -8192, 1, -1, 255, -256]);
+    let shifts = i16x8::from_slice(simd, &[1, 2, 1, 2, 15, 1, 4, 3]);
+    assert_eq!(
+        *(a << shifts),
+        [-32768, -32768, -32768, -32768, -32768, -2, 4080, -2048]
+    );
+}
+
+#[simd_test]
+fn shrv_i16x8<S: Simd>(simd: S) {
+    let a = i16x8::from_slice(simd, &[-32768, -16384, -1025, -1, 32767, 16384, 1025, 1]);
+    let shifts = i16x8::from_slice(simd, &[1, 2, 3, 15, 1, 2, 3, 0]);
+    assert_eq!(
+        *(a >> shifts),
+        [-16384, -4096, -129, -1, 16383, 4096, 128, 1]
+    );
+}
+
+#[simd_test]
+fn shlv_u16x8<S: Simd>(simd: S) {
+    let a = u16x8::from_slice(simd, &[65535, 32768, 16384, 8192, 1, 255, 1024, 4096]);
+    let shifts = u16x8::from_slice(simd, &[4, 1, 2, 3, 15, 4, 5, 0]);
+    assert_eq!(*(a << shifts), [65520, 0, 0, 0, 32768, 4080, 32768, 4096]);
+}
+
+#[simd_test]
+fn shrv_u16x8<S: Simd>(simd: S) {
+    let a = u16x8::from_slice(simd, &[65535, 32768, 16384, 8192, 1, 255, 1024, 4096]);
+    let shifts = u16x8::from_slice(simd, &[1, 2, 3, 4, 0, 4, 5, 12]);
+    assert_eq!(*(a >> shifts), [32767, 8192, 2048, 512, 1, 15, 32, 1]);
+}
+
+#[simd_test]
+fn shlv_u8x32<S: Simd>(simd: S) {
+    let a = u8x32::from_slice(
+        simd,
+        &[
+            255, 128, 64, 32, 16, 8, 4, 2, 1, 3, 5, 7, 15, 31, 63, 127, 255, 128, 64, 32, 16, 8, 4,
+            2, 1, 3, 5, 7, 15, 31, 63, 127,
+        ],
+    );
+    let shifts = u8x32::from_slice(
+        simd,
+        &[
+            4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 3, 2, 1, 4, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4,
+            3, 2, 1,
+        ],
+    );
+    assert_eq!(
+        *(a << shifts),
+        [
+            240, 0, 0, 0, 0, 0, 0, 0, 1, 6, 20, 56, 240, 248, 252, 254, 240, 0, 0, 0, 0, 0, 0, 0,
+            1, 6, 20, 56, 240, 248, 252, 254
+        ]
+    );
+}
+
+#[simd_test]
+fn shlv_u16x16<S: Simd>(simd: S) {
+    let a = u16x16::from_slice(
+        simd,
+        &[
+            65535, 32768, 16384, 8192, 1, 255, 1024, 4096, 65535, 32768, 16384, 8192, 1, 255, 1024,
+            4096,
+        ],
+    );
+    let shifts = u16x16::from_slice(simd, &[4, 1, 2, 3, 15, 4, 5, 0, 4, 1, 2, 3, 15, 4, 5, 0]);
+    assert_eq!(
+        *(a << shifts),
+        [
+            65520, 0, 0, 0, 32768, 4080, 32768, 4096, 65520, 0, 0, 0, 32768, 4080, 32768, 4096
+        ]
+    );
+}
+
 #[simd_test]
 fn add_i16x8<S: Simd>(simd: S) {
     let a = i16x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);

From f2ba8c93613e38a6baa27be75d93cc1e00e7ca1a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 21:04:38 +0100
Subject: [PATCH 12/55] Optimize
 floor/ceil/round_ties_even/trunc/approximate_recip for 512-bit vectors on
 AVX-512; expand test coverage

---
 fearless_simd/src/generated/avx512.rs         | 66 ++++++++++---------
 fearless_simd_gen/src/mk_x86.rs               | 56 ++++++++++++----
 .../harness/lm_generated/extended_512.rs      | 55 ++++++++++++++++
 3 files changed, 134 insertions(+), 43 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index f80743ad7..0bdf94987 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -6077,11 +6077,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.approximate_recip_f32x8(a0),
-            self.approximate_recip_f32x8(a1),
-        )
+        unsafe { _mm512_rcp14_ps(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
@@ -6275,21 +6271,24 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+        unsafe {
+            _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
+        unsafe {
+            _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.round_ties_even_f32x8(a0),
-            self.round_ties_even_f32x8(a1),
-        )
+        unsafe {
+            _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
@@ -6297,8 +6296,10 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
+        unsafe {
+            _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
@@ -9058,11 +9059,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.approximate_recip_f64x4(a0),
-            self.approximate_recip_f64x4(a1),
-        )
+        unsafe { _mm512_rcp14_pd(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
@@ -9240,21 +9237,24 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.round_ties_even_f64x4(a0),
-            self.round_ties_even_f64x4(a1),
-        )
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
@@ -9262,8 +9262,10 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
+        unsafe {
+            _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                .simd_into(self)
+        }
     }
     #[inline(always)]
     fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 20f5d8879..23498b125 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -243,17 +243,6 @@ impl Level for X86 {
     }
 
     fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool {
-        if *self == Self::Avx512
-            && vec_ty.scalar == ScalarType::Float
-            && vec_ty.n_bits() == 512
-            && matches!(
-                op.method,
-                "floor" | "ceil" | "round_ties_even" | "trunc" | "approximate_recip"
-            )
-        {
-            return true;
-        }
-
         if *self == Self::Avx512
             && matches!(
                 op.sig,
@@ -1291,6 +1280,51 @@ impl X86 {
             };
         }
 
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float && vec_ty.n_bits() == 512 {
+            let body = match method {
+                "floor" | "ceil" | "round_ties_even" | "trunc" => {
+                    let intrinsic = intrinsic_ident(
+                        "roundscale",
+                        op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
+                        512,
+                    );
+                    let rounding_mode = match method {
+                        "floor" => quote! { _MM_FROUND_TO_NEG_INF },
+                        "ceil" => quote! { _MM_FROUND_TO_POS_INF },
+                        "round_ties_even" => quote! { _MM_FROUND_TO_NEAREST_INT },
+                        "trunc" => quote! { _MM_FROUND_TO_ZERO },
+                        _ => unreachable!(),
+                    };
+                    quote! {
+                        unsafe {
+                            #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                "approximate_recip" => {
+                    let intrinsic = intrinsic_ident(
+                        "rcp14",
+                        op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
+                        512,
+                    );
+                    quote! {
+                        unsafe {
+                            #intrinsic(a.into()).simd_into(self)
+                        }
+                    }
+                }
+                _ => TokenStream::new(),
+            };
+
+            if !body.is_empty() {
+                return quote! {
+                    #method_sig {
+                        #body
+                    }
+                };
+            }
+        }
+
         match method {
             "fract" => {
                 let trunc_op = generic_op_name("trunc", vec_ty);
diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
index bc5c93556..e06ccf099 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
@@ -512,6 +512,61 @@ fn fract_f32x16<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn fract_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[1.7, -2.3, 3.9, -4.1, 5.5, -6.6, 7.2, -8.8]);
+    let result = simd.fract_f64x8(a);
+    assert_eq!(
+        *result,
+        [
+            0.7,
+            -0.2999999999999998,
+            0.8999999999999999,
+            -0.09999999999999964,
+            0.5,
+            -0.5999999999999996,
+            0.20000000000000018,
+            -0.8000000000000007
+        ]
+    );
+}
+
+#[simd_test]
+fn approximate_recip_f32x16<S: Simd>(simd: S) {
+    let a = f32x16::from_slice(
+        simd,
+        &[
+            1.0, -2.0, 23.0, 9.0, 0.5, -0.25, 128.0, -1024.0, 3.0, -7.0, 11.0, -13.0, 19.0, -29.0,
+            37.0, -41.0,
+        ],
+    );
+    let result = a.approximate_recip();
+    for i in 0..16 {
+        let expected = 1.0 / a[i];
+        let rel_error = ((result[i] - expected) / expected).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
+#[simd_test]
+fn approximate_recip_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[1.0, -2.0, 23.0, 9.0, 0.5, -0.25, 128.0, -1024.0]);
+    let result = a.approximate_recip();
+    for i in 0..8 {
+        let expected = 1.0 / a[i];
+        let rel_error = ((result[i] - expected) / expected).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
 // =============================================================================
 // max_precise and min_precise tests (512-bit floats)
 // =============================================================================

From 9cddbb2b3b3d97e1b59ff94b960eb2ef527db9d3 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 21:15:21 +0100
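Subject: [PATCH 13/55] Use AVX-512 rcp14 for smaller vector sizes too

For f32x4 and f32x8 this improves precision over rcp_ps at no cost to
throughput. For f64x2 and f64x4 it replaces the exact `1.0 / a` division
with the 14-bit rcp14 estimate, so those results are now approximate
rather than correctly rounded.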

---
 fearless_simd/src/generated/avx512.rs         |  8 +++----
 fearless_simd_gen/src/mk_x86.rs               |  8 +++----
 .../tests/harness/lm_generated/mod_256.rs     | 24 +++++++++++++++++++
 fearless_simd_tests/tests/harness/mod.rs      | 17 ++++++++++++-
 4 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 0bdf94987..2aff63e5e 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -193,7 +193,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
-        unsafe { _mm_rcp_ps(a.into()).simd_into(self) }
+        unsafe { _mm_rcp14_ps(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
@@ -2395,7 +2395,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
-        1.0 / a
+        unsafe { _mm_rcp14_pd(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
@@ -2798,7 +2798,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        unsafe { _mm256_rcp_ps(a.into()).simd_into(self) }
+        unsafe { _mm256_rcp14_ps(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
@@ -5615,7 +5615,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        1.0 / a
+        unsafe { _mm256_rcp14_pd(a.into()).simd_into(self) }
     }
     #[inline(always)]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 23498b125..c8e7327ef 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -1280,13 +1280,13 @@ impl X86 {
             };
         }
 
-        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float && vec_ty.n_bits() == 512 {
+        if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float {
             let body = match method {
-                "floor" | "ceil" | "round_ties_even" | "trunc" => {
+                "floor" | "ceil" | "round_ties_even" | "trunc" if vec_ty.n_bits() == 512 => {
                     let intrinsic = intrinsic_ident(
                         "roundscale",
                         op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
-                        512,
+                        vec_ty.n_bits(),
                     );
                     let rounding_mode = match method {
                         "floor" => quote! { _MM_FROUND_TO_NEG_INF },
@@ -1305,7 +1305,7 @@ impl X86 {
                     let intrinsic = intrinsic_ident(
                         "rcp14",
                         op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
-                        512,
+                        vec_ty.n_bits(),
                     );
                     quote! {
                         unsafe {
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
index 797f54f64..7f33ebc6f 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
@@ -65,6 +65,30 @@ fn sqrt_f32x8<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn approximate_recip_f32x8<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(simd, &[1.0, -2.0, 23.0, 9.0, 3.5, -7.25, 13.0, 0.25]);
+    let result = a.approximate_recip();
+    let expected = [
+        1.0,
+        -0.5,
+        1. / 23.,
+        1. / 9.,
+        1. / 3.5,
+        1. / -7.25,
+        1. / 13.,
+        4.0,
+    ];
+    for i in 0..8 {
+        let rel_error = ((result[i] - expected[i]) / expected[i]).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
 #[simd_test]
 fn div_f32x8<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[4.0, 2.0, 1.0, 0.0, 10.0, 12.0, 15.0, 20.0]);
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index e357c5bb6..3716efbce 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -3412,10 +3412,25 @@ fn sqrt_f64x2<S: Simd>(simd: S) {
 
 #[simd_test]
 fn approximate_recip_f64x2<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[1.0, -2.0]);
+    let result = a.approximate_recip();
+    let expected = [1.0, -0.5];
+    for i in 0..2 {
+        let rel_error = ((result[i] - expected[i]) / expected[i]).abs();
+        assert!(
+            rel_error < 0.005,
+            "approximate_recip({}) rel_error = {rel_error}",
+            a[i]
+        );
+    }
+}
+
+#[simd_test]
+fn approximate_recip_f64x4<S: Simd>(simd: S) {
     let a = f64x4::from_slice(simd, &[1.0, -2.0, 23.0, 9.0]);
     let result = a.approximate_recip();
     let expected = [1.0, -0.5, 1. / 23., 1. / 9.];
-    for i in 0..2 {
+    for i in 0..4 {
         let rel_error = ((result[i] - expected[i]) / expected[i]).abs();
         assert!(
             rel_error < 0.005,

From 9d02c3a13093b5d2280c6361c39052cb87e3c4db Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 21:42:30 +0100
Subject: [PATCH 14/55] Optimize slide_within_blocks for AVX-512; verified with
 exhaustive slide test

---
 fearless_simd/src/generated/avx512.rs | 390 +++++++++++++++++++-------
 fearless_simd_gen/src/mk_x86.rs       |  37 ++-
 2 files changed, 327 insertions(+), 100 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 2aff63e5e..7bd4f5441 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -2777,12 +2777,21 @@ impl Simd for Avx512 {
         a: f32x8<Self>,
         b: f32x8<Self>,
     ) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_f32x8(a).val.0;
+            let b = self.cvt_to_bytes_f32x8(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 4usize);
+            self.cvt_from_bytes_f32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
@@ -3170,12 +3179,21 @@ impl Simd for Avx512 {
         a: i8x32<Self>,
         b: i8x32<Self>,
     ) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(
-            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 16usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_i8x32(a).val.0;
+            let b = self.cvt_to_bytes_i8x32(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT);
+            self.cvt_from_bytes_i8x32(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -3556,12 +3574,21 @@ impl Simd for Avx512 {
         a: u8x32<Self>,
         b: u8x32<Self>,
     ) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(
-            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 16usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_u8x32(a).val.0;
+            let b = self.cvt_to_bytes_u8x32(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT);
+            self.cvt_from_bytes_u8x32(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
@@ -4079,12 +4106,21 @@ impl Simd for Avx512 {
         a: i16x16<Self>,
         b: i16x16<Self>,
     ) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(
-            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 8usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_i16x16(a).val.0;
+            let b = self.cvt_to_bytes_i16x16(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 2usize);
+            self.cvt_from_bytes_i16x16(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
@@ -4392,12 +4428,21 @@ impl Simd for Avx512 {
         a: u16x16<Self>,
         b: u16x16<Self>,
     ) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(
-            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 8usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_u16x16(a).val.0;
+            let b = self.cvt_to_bytes_u16x16(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 2usize);
+            self.cvt_from_bytes_u16x16(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
@@ -4846,12 +4891,21 @@ impl Simd for Avx512 {
         a: i32x8<Self>,
         b: i32x8<Self>,
     ) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(
-            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_i32x8(a).val.0;
+            let b = self.cvt_to_bytes_i32x8(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 4usize);
+            self.cvt_from_bytes_i32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
@@ -5147,12 +5201,21 @@ impl Simd for Avx512 {
         a: u32x8<Self>,
         b: u32x8<Self>,
     ) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(
-            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_u32x8(a).val.0;
+            let b = self.cvt_to_bytes_u32x8(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 4usize);
+            self.cvt_from_bytes_u32x8(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
@@ -5594,12 +5657,21 @@ impl Simd for Avx512 {
         a: f64x4<Self>,
         b: f64x4<Self>,
     ) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 2usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_f64x4(a).val.0;
+            let b = self.cvt_to_bytes_f64x4(b).val.0;
+            let result = dyn_alignr_256(b, a, SHIFT * 8usize);
+            self.cvt_from_bytes_f64x4(u8x32 {
+                val: crate::support::Aligned256(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
@@ -6056,12 +6128,21 @@ impl Simd for Avx512 {
         a: f32x16<Self>,
         b: f32x16<Self>,
     ) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_f32x16(a).val.0;
+            let b = self.cvt_to_bytes_f32x16(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 4usize);
+            self.cvt_from_bytes_f32x16(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
@@ -6476,12 +6557,21 @@ impl Simd for Avx512 {
         a: i8x64<Self>,
         b: i8x64<Self>,
     ) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(
-            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 16usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_i8x64(a).val.0;
+            let b = self.cvt_to_bytes_i8x64(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT);
+            self.cvt_from_bytes_i8x64(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
@@ -6886,12 +6976,21 @@ impl Simd for Avx512 {
         a: u8x64<Self>,
         b: u8x64<Self>,
     ) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(
-            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 16usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_u8x64(a).val.0;
+            let b = self.cvt_to_bytes_u8x64(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT);
+            self.cvt_from_bytes_u8x64(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
@@ -7440,12 +7539,21 @@ impl Simd for Avx512 {
         a: i16x32<Self>,
         b: i16x32<Self>,
     ) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 8usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_i16x32(a).val.0;
+            let b = self.cvt_to_bytes_i16x32(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 2usize);
+            self.cvt_from_bytes_i16x32(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
@@ -7773,12 +7881,21 @@ impl Simd for Avx512 {
         a: u16x32<Self>,
         b: u16x32<Self>,
     ) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 8usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_u16x32(a).val.0;
+            let b = self.cvt_to_bytes_u16x32(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 2usize);
+            self.cvt_from_bytes_u16x32(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
@@ -8266,12 +8383,21 @@ impl Simd for Avx512 {
         a: i32x16<Self>,
         b: i32x16<Self>,
     ) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(
-            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_i32x16(a).val.0;
+            let b = self.cvt_to_bytes_i32x16(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 4usize);
+            self.cvt_from_bytes_i32x16(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
@@ -8579,12 +8705,21 @@ impl Simd for Avx512 {
         a: u32x16<Self>,
         b: u32x16<Self>,
     ) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(
-            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 4usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_u32x16(a).val.0;
+            let b = self.cvt_to_bytes_u32x16(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 4usize);
+            self.cvt_from_bytes_u32x16(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
@@ -9038,12 +9173,21 @@ impl Simd for Avx512 {
         a: f64x8<Self>,
         b: f64x8<Self>,
     ) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
-        )
+        unsafe {
+            if SHIFT == 0 {
+                return a;
+            }
+            if SHIFT >= 2usize {
+                return b;
+            }
+            let a = self.cvt_to_bytes_f64x8(a).val.0;
+            let b = self.cvt_to_bytes_f64x8(b).val.0;
+            let result = dyn_alignr_512(b, a, SHIFT * 8usize);
+            self.cvt_from_bytes_f64x8(u8x64 {
+                val: crate::support::Aligned512(result),
+                simd: self,
+            })
+        }
     }
     #[inline(always)]
     fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
@@ -9757,3 +9901,57 @@ unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
         }
     }
 }
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
+    unsafe {
+        match shift {
+            0usize => _mm256_alignr_epi8::<0i32>(a, b),
+            1usize => _mm256_alignr_epi8::<1i32>(a, b),
+            2usize => _mm256_alignr_epi8::<2i32>(a, b),
+            3usize => _mm256_alignr_epi8::<3i32>(a, b),
+            4usize => _mm256_alignr_epi8::<4i32>(a, b),
+            5usize => _mm256_alignr_epi8::<5i32>(a, b),
+            6usize => _mm256_alignr_epi8::<6i32>(a, b),
+            7usize => _mm256_alignr_epi8::<7i32>(a, b),
+            8usize => _mm256_alignr_epi8::<8i32>(a, b),
+            9usize => _mm256_alignr_epi8::<9i32>(a, b),
+            10usize => _mm256_alignr_epi8::<10i32>(a, b),
+            11usize => _mm256_alignr_epi8::<11i32>(a, b),
+            12usize => _mm256_alignr_epi8::<12i32>(a, b),
+            13usize => _mm256_alignr_epi8::<13i32>(a, b),
+            14usize => _mm256_alignr_epi8::<14i32>(a, b),
+            15usize => _mm256_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(),
+        }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_512(a: __m512i, b: __m512i, shift: usize) -> __m512i {
+    unsafe {
+        match shift {
+            0usize => _mm512_alignr_epi8::<0i32>(a, b),
+            1usize => _mm512_alignr_epi8::<1i32>(a, b),
+            2usize => _mm512_alignr_epi8::<2i32>(a, b),
+            3usize => _mm512_alignr_epi8::<3i32>(a, b),
+            4usize => _mm512_alignr_epi8::<4i32>(a, b),
+            5usize => _mm512_alignr_epi8::<5i32>(a, b),
+            6usize => _mm512_alignr_epi8::<6i32>(a, b),
+            7usize => _mm512_alignr_epi8::<7i32>(a, b),
+            8usize => _mm512_alignr_epi8::<8i32>(a, b),
+            9usize => _mm512_alignr_epi8::<9i32>(a, b),
+            10usize => _mm512_alignr_epi8::<10i32>(a, b),
+            11usize => _mm512_alignr_epi8::<11i32>(a, b),
+            12usize => _mm512_alignr_epi8::<12i32>(a, b),
+            13usize => _mm512_alignr_epi8::<13i32>(a, b),
+            14usize => _mm512_alignr_epi8::<14i32>(a, b),
+            15usize => _mm512_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(),
+        }
+    }
+}
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index c8e7327ef..cdb307440 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -251,6 +251,7 @@ impl Level for X86 {
                     ..
                 }
             )
+            && vec_ty.scalar == ScalarType::Mask
             && vec_ty.n_bits() > 128
         {
             return true;
@@ -2434,6 +2435,37 @@ impl X86 {
         let to_bytes = generic_op_name("cvt_to_bytes", vec_ty);
         let from_bytes = generic_op_name("cvt_from_bytes", vec_ty);
 
+        if *self == Self::Avx512
+            && granularity == WithinBlocks
+            && vec_ty.scalar != ScalarType::Mask
+            && vec_ty.n_bits() >= 256
+        {
+            let alignr = format_ident!("dyn_alignr_{}", vec_ty.n_bits());
+            let byte_shift = if scalar_bytes == 1 {
+                quote! { SHIFT }
+            } else {
+                quote! { SHIFT * #scalar_bytes }
+            };
+
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        if SHIFT == 0 {
+                            return a;
+                        }
+                        if SHIFT >= #max_shift {
+                            return b;
+                        }
+
+                        let a = self.#to_bytes(a).val.0;
+                        let b = self.#to_bytes(b).val.0;
+                        let result = #alignr(b, a, #byte_shift);
+                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
+                    }
+                }
+            };
+        }
+
         if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 {
             let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
             let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len);
@@ -3259,10 +3291,7 @@ impl X86 {
         let vec_widths: &[usize] = match self {
             Self::Sse4_2 => &[128],
             Self::Avx2 => &[128, 256],
-            // AVX-512 uses byte-wise permutex2var for 256/512-bit slide operations.
-            // It only needs the legacy alignr helper for 128-bit slides and for
-            // wider within-block slides that decompose through 128-bit lanes.
-            Self::Avx512 => &[128],
+            Self::Avx512 => &[128, 256, 512],
         };
 
         for vec_ty in vec_widths

From 85b44c9521e4104cafe1ee4064a9025ce346a53b Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 21:45:12 +0100
Subject: [PATCH 15/55] Remove stale tests for mask slide APIs; they were under
 #[cfg(false)] so they didn't show up earlier when I removed those methods.

---
 .../tests/harness/slide_exhaustive.rs         | 39 -------------------
 1 file changed, 39 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/slide_exhaustive.rs b/fearless_simd_tests/tests/harness/slide_exhaustive.rs
index 1b82d4548..f41752646 100644
--- a/fearless_simd_tests/tests/harness/slide_exhaustive.rs
+++ b/fearless_simd_tests/tests/harness/slide_exhaustive.rs
@@ -251,42 +251,3 @@ test_slide_exhaustive!(slide_exhaustive_i16x32, i16x32, i16, 32, vec32, block8);
 test_slide_exhaustive!(slide_exhaustive_u16x32, u16x32, u16, 32, vec32, block8);
 test_slide_exhaustive!(slide_exhaustive_i32x16, i32x16, i32, 16, vec16, block4);
 test_slide_exhaustive!(slide_exhaustive_u32x16, u32x16, u32, 16, vec16, block4);
-
-// Mask types (128-bit)
-test_slide_exhaustive!(slide_exhaustive_mask8x16, mask8x16, i8, 16, vec16, block16);
-test_slide_exhaustive!(slide_exhaustive_mask16x8, mask16x8, i16, 8, vec8, block8);
-test_slide_exhaustive!(slide_exhaustive_mask32x4, mask32x4, i32, 4, vec4, block4);
-test_slide_exhaustive!(slide_exhaustive_mask64x2, mask64x2, i64, 2, vec2, block2);
-
-// Mask types (256-bit)
-test_slide_exhaustive!(slide_exhaustive_mask8x32, mask8x32, i8, 32, vec32, block16);
-test_slide_exhaustive!(
-    slide_exhaustive_mask16x16,
-    mask16x16,
-    i16,
-    16,
-    vec16,
-    block8
-);
-test_slide_exhaustive!(slide_exhaustive_mask32x8, mask32x8, i32, 8, vec8, block4);
-test_slide_exhaustive!(slide_exhaustive_mask64x4, mask64x4, i64, 4, vec4, block2);
-
-// Mask types (512-bit)
-test_slide_exhaustive!(slide_exhaustive_mask8x64, mask8x64, i8, 64, vec64, block16);
-test_slide_exhaustive!(
-    slide_exhaustive_mask16x32,
-    mask16x32,
-    i16,
-    32,
-    vec32,
-    block8
-);
-test_slide_exhaustive!(
-    slide_exhaustive_mask32x16,
-    mask32x16,
-    i32,
-    16,
-    vec16,
-    block4
-);
-test_slide_exhaustive!(slide_exhaustive_mask64x8, mask64x8, i64, 8, vec8, block2);

From 1c558ca84a3c350a9358bc3cf03a55032601415c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 21:55:05 +0100
Subject: [PATCH 16/55] Make the clippy transmute_copy lint reasons consistent

---
 .clippy.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.clippy.toml b/.clippy.toml
index ea0a2fd43..c9afb65bc 100644
--- a/.clippy.toml
+++ b/.clippy.toml
@@ -9,7 +9,7 @@ trivial-copy-size-limit = 16
 
 disallowed-methods = [
     { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
-    { path = "std::mem::transmute_copy", reason = "Use a checked wrapper so equal sizes are asserted at compile time." },
+    { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
 ]
 
 # END LINEBENDER LINT SET

From 6c8f7d7c1fe11069930f4ce9729caae0843cd23c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 21:57:42 +0100
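Subject: [PATCH 17/55] Satisfy Clippy in the generator and generated code

- Allow `clippy::identity_op` and `clippy::useless_conversion` at the top of
  the generated `avx512.rs` through a new `Level::make_module_attrs` hook.
- Drop the explicit deref when reading the mask bits in the generated AVX-512
  mask set methods.
- Put `AArch64` in backticks in the `approximate_recip` docs.
- Add messages to the bare `assert!`/`assert_eq!` calls in `mk_x86.rs` and
  remove a let-and-return in `avx512_mask_lane_bits`.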

---
 fearless_simd/src/generated/avx512.rs     | 32 +++++++-----
 fearless_simd/src/generated/simd_trait.rs | 14 ++---
 fearless_simd_gen/src/level.rs            |  7 +++
 fearless_simd_gen/src/mk_x86.rs           | 64 +++++++++++++++++++----
 fearless_simd_gen/src/ops.rs              |  2 +-
 5 files changed, 88 insertions(+), 31 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 7bd4f5441..283b498d9 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -3,6 +3,14 @@
 
 // This file is autogenerated by fearless_simd_gen
 
+#![allow(
+    clippy::identity_op,
+    reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+)]
+#![allow(
+    clippy::useless_conversion,
+    reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+)]
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
@@ -1022,7 +1030,7 @@ impl Simd for Avx512 {
             16usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask8x16 {
             val: (bits) as _,
@@ -1613,7 +1621,7 @@ impl Simd for Avx512 {
             8usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask16x8 {
             val: (bits) as _,
@@ -2214,7 +2222,7 @@ impl Simd for Avx512 {
             4usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask32x4 {
             val: (bits) as _,
@@ -2605,7 +2613,7 @@ impl Simd for Avx512 {
             2usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask64x2 {
             val: (bits) as _,
@@ -3920,7 +3928,7 @@ impl Simd for Avx512 {
             32usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask8x32 {
             val: (bits) as _,
@@ -4705,7 +4713,7 @@ impl Simd for Avx512 {
             16usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask16x16 {
             val: (bits) as _,
@@ -5471,7 +5479,7 @@ impl Simd for Avx512 {
             8usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask32x8 {
             val: (bits) as _,
@@ -5940,7 +5948,7 @@ impl Simd for Avx512 {
             4usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask64x4 {
             val: (bits) as _,
@@ -7359,7 +7367,7 @@ impl Simd for Avx512 {
             64usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask8x64 {
             val: bits,
@@ -8203,7 +8211,7 @@ impl Simd for Avx512 {
             32usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask16x32 {
             val: (bits) as _,
@@ -8993,7 +9001,7 @@ impl Simd for Avx512 {
             16usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask32x16 {
             val: (bits) as _,
@@ -9471,7 +9479,7 @@ impl Simd for Avx512 {
             8usize
         );
         let bit = 1u64 << index;
-        let bits = u64::from((*a).val);
+        let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
         *a = mask64x8 {
             val: (bits) as _,
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 1ecd25438..59357355e 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -150,7 +150,7 @@ pub trait Simd:
     fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
@@ -875,7 +875,7 @@ pub trait Simd:
     fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
@@ -1008,7 +1008,7 @@ pub trait Simd:
     fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
@@ -1755,7 +1755,7 @@ pub trait Simd:
     fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
@@ -1892,7 +1892,7 @@ pub trait Simd:
     fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
@@ -2633,7 +2633,7 @@ pub trait Simd:
     fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
@@ -2885,7 +2885,7 @@ pub trait SimdFloat<S: Simd>:
     fn abs(self) -> Self;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt(self) -> Self;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip(self) -> Self;
     #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index 0a5d2735a..d7e2f2f5d 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -46,6 +46,10 @@ pub(crate) trait Level {
     /// Any additional imports or supporting code necessary for the module (for instance, importing
     /// implementation-specific functions from `core::arch`).
     fn make_module_prelude(&self) -> TokenStream;
+    /// Inner attributes to place at the top of the generated module.
+    fn make_module_attrs(&self) -> TokenStream {
+        TokenStream::new()
+    }
     /// The body of the SIMD token's inherent `impl` block. By convention, this contains an unsafe `new_unchecked`
     /// method for constructing a SIMD token that may not be supported on current hardware, or a safe `new` method for
     /// constructing a SIMD token that is statically known to be supported.
@@ -261,6 +265,7 @@ pub(crate) trait Level {
         let level_tok = self.token();
         let token_doc = self.token_doc();
         let imports = type_imports();
+        let module_attrs = self.make_module_attrs();
         let module_prelude = self.make_module_prelude();
         let impl_body = self.make_impl_body();
         let arch_types_impl = self.impl_arch_types();
@@ -269,6 +274,8 @@ pub(crate) trait Level {
         let footer = self.make_module_footer();
 
         quote! {
+            #module_attrs
+
             use crate::{prelude::*, seal::Seal, arch_types::ArchTypes, Level};
 
             #imports
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index cdb307440..5c9b8ffab 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -104,6 +104,23 @@ impl Level for X86 {
         }
     }
 
+    fn make_module_attrs(&self) -> TokenStream {
+        if *self != Self::Avx512 {
+            return TokenStream::new();
+        }
+
+        quote! {
+            #![allow(
+                clippy::identity_op,
+                reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+            )]
+            #![allow(
+                clippy::useless_conversion,
+                reason = "AVX-512 mask code is generated uniformly for all __mmask widths"
+            )]
+        }
+    }
+
     fn make_module_footer(&self) -> TokenStream {
         let alignr_helpers = self.dyn_alignr_helpers();
         let slide_helpers = match self {
@@ -706,13 +723,12 @@ fn avx512_mask_register_bits(vec_ty: &VecType) -> usize {
 }
 
 fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream {
-    let bits = if vec_ty.len == 64 {
+    if vec_ty.len == 64 {
         quote! { u64::MAX }
     } else {
         let bits = (1_u64 << vec_ty.len) - 1;
         quote! { #bits }
-    };
-    bits
+    }
 }
 
 fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
@@ -940,7 +956,11 @@ impl X86 {
         vec_ty: &VecType,
         kind: crate::ops::RefKind,
     ) -> TokenStream {
-        assert_eq!(vec_ty.scalar, ScalarType::Mask);
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "AVX-512 mask array loads only operate on mask types"
+        );
         let movepi_mask = intrinsic_ident(
             &format!("movepi{}", vec_ty.scalar_bits),
             "mask",
@@ -970,7 +990,11 @@ impl X86 {
         vec_ty: &VecType,
         kind: crate::ops::RefKind,
     ) -> TokenStream {
-        assert_eq!(vec_ty.scalar, ScalarType::Mask);
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "AVX-512 mask array stores only operate on mask types"
+        );
         assert!(
             kind == crate::ops::RefKind::Value,
             "mask array references are not exposed"
@@ -995,9 +1019,13 @@ impl X86 {
         method_sig: TokenStream,
         vec_ty: &VecType,
     ) -> TokenStream {
-        assert_eq!(vec_ty.scalar, ScalarType::Mask);
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "AVX-512 mask set only operates on mask types"
+        );
         let len = vec_ty.len;
-        let bits = avx512_mask_bits_expr(quote! { *a });
+        let bits = avx512_mask_bits_expr(quote! { a });
         let result = avx512_mask_value(vec_ty, quote! { bits });
 
         quote! {
@@ -1657,8 +1685,14 @@ impl X86 {
     }
 
     fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream {
-        assert!(*self == Self::Avx512);
-        assert!(matches!(vec_ty.scalar_bits, 8 | 16));
+        assert!(
+            *self == Self::Avx512,
+            "narrow variable shifts are specialized for AVX-512"
+        );
+        assert!(
+            matches!(vec_ty.scalar_bits, 8 | 16),
+            "narrow variable shifts only handle 8-bit and 16-bit lanes"
+        );
         let name = match (method, vec_ty.scalar) {
             ("shrv", ScalarType::Int) => "srav",
             ("shrv", _) => "srlv",
@@ -3094,7 +3128,11 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
-        assert_eq!(vec_ty.n_bits(), 512);
+        assert_eq!(
+            vec_ty.n_bits(),
+            512,
+            "AVX-512 interleaved loads only specialize 512-bit vectors"
+        );
         let load_unaligned = intrinsic_ident("loadu", coarse_type(vec_ty), vec_ty.n_bits());
         let permute = avx512_permutexvar_intrinsic(vec_ty);
         let indices = avx512_index_vector(
@@ -3263,7 +3301,11 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
-        assert_eq!(vec_ty.n_bits(), 512);
+        assert_eq!(
+            vec_ty.n_bits(),
+            512,
+            "AVX-512 interleaved stores only specialize 512-bit vectors"
+        );
         let store_unaligned = intrinsic_ident("storeu", coarse_type(vec_ty), vec_ty.n_bits());
         let permute = avx512_permutexvar_intrinsic(vec_ty);
         let indices = avx512_index_vector(
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index dd9cc7f65..860a38382 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -626,7 +626,7 @@ const FLOAT_OPS: &[Op] = &[
         "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\
          This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\
          On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \
-         On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \
+         On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \
          The precision of this operation may change as new platform support is added.",
     ),
     Op::new(

From e475ae12fab9a50838db281acee9efb6071c8e95 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:00:03 +0100
Subject: [PATCH 18/55] Remove redundant braces around `unsafe` blocks in generated code

---
 fearless_simd/src/generated/avx2.rs   | 120 +++++++++++++-------------
 fearless_simd/src/generated/avx512.rs |  96 ++++++++++-----------
 fearless_simd/src/generated/neon.rs   | 120 +++++++++++++-------------
 fearless_simd/src/generated/sse4_2.rs | 120 +++++++++++++-------------
 fearless_simd/src/generated/wasm.rs   | 120 +++++++++++++-------------
 fearless_simd_gen/src/generic.rs      |  31 +++----
 6 files changed, 304 insertions(+), 303 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 40d2c7d8c..7a518cc95 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -103,14 +103,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -413,14 +413,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -647,14 +647,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -890,7 +890,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -985,14 +985,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1194,14 +1194,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1412,7 +1412,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1507,14 +1507,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1718,14 +1718,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1944,7 +1944,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -2034,14 +2034,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2276,7 +2276,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -2366,14 +2366,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2739,14 +2739,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3056,14 +3056,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3387,7 +3387,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -3499,14 +3499,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3797,14 +3797,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4116,7 +4116,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4226,14 +4226,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4498,14 +4498,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4788,7 +4788,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4890,14 +4890,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5195,7 +5195,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -5298,14 +5298,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5718,14 +5718,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6000,14 +6000,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6327,7 +6327,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -6475,14 +6475,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6766,14 +6766,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7120,7 +7120,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -7245,14 +7245,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7532,14 +7532,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7851,7 +7851,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -7984,14 +7984,14 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -8312,7 +8312,7 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 283b498d9..0f0704cc6 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -111,14 +111,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -428,14 +428,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -713,14 +713,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1121,14 +1121,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1355,14 +1355,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1712,14 +1712,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1948,14 +1948,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2313,14 +2313,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2704,14 +2704,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3106,14 +3106,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3501,14 +3501,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4033,14 +4033,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4355,14 +4355,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4818,14 +4818,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5128,14 +5128,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5584,14 +5584,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6053,14 +6053,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6482,14 +6482,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6901,14 +6901,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7464,14 +7464,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7806,14 +7806,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -8308,14 +8308,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -8630,14 +8630,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -9098,14 +9098,14 @@ impl Simd for Avx512 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index 2eaccf475..fcb08a150 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -93,14 +93,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -358,14 +358,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -568,14 +568,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -781,7 +781,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -885,14 +885,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1095,14 +1095,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1304,7 +1304,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1399,14 +1399,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1613,14 +1613,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1822,7 +1822,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1917,14 +1917,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2157,7 +2157,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -2253,14 +2253,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2657,14 +2657,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2968,14 +2968,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3274,7 +3274,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -3400,14 +3400,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3711,14 +3711,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4026,7 +4026,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4152,14 +4152,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4468,14 +4468,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4771,7 +4771,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4897,14 +4897,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5254,7 +5254,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -5380,14 +5380,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5801,14 +5801,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6121,14 +6121,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6439,7 +6439,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -6556,14 +6556,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6885,14 +6885,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7222,7 +7222,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -7342,14 +7342,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7667,14 +7667,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7987,7 +7987,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -8104,14 +8104,14 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -8470,7 +8470,7 @@ impl Simd for Neon {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index a2cf7f67b..1a63c3a99 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -129,14 +129,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -442,14 +442,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -679,14 +679,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -930,7 +930,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1028,14 +1028,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1240,14 +1240,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1461,7 +1461,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1559,14 +1559,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1773,14 +1773,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2002,7 +2002,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -2095,14 +2095,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2340,7 +2340,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -2434,14 +2434,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2816,14 +2816,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3105,14 +3105,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3389,7 +3389,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -3513,14 +3513,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3802,14 +3802,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4097,7 +4097,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4223,14 +4223,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4517,14 +4517,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4798,7 +4798,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4922,14 +4922,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5257,7 +5257,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -5381,14 +5381,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5801,14 +5801,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6083,14 +6083,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6410,7 +6410,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -6564,14 +6564,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6855,14 +6855,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7201,7 +7201,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -7326,14 +7326,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7613,14 +7613,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7932,7 +7932,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -8049,14 +8049,14 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -8377,7 +8377,7 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 6ace3b9c1..9ad776c66 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -92,14 +92,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
         f32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -398,14 +398,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
         i8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -623,14 +623,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
         u8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -847,7 +847,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
         mask8x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -945,14 +945,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
         i16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1154,14 +1154,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
         u16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1360,7 +1360,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
         mask16x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1456,14 +1456,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
         i32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1669,14 +1669,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
         u32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -1875,7 +1875,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
         mask32x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -1971,14 +1971,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
         f64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2236,7 +2236,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
         mask64x2 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -2333,14 +2333,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
         f32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -2715,14 +2715,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
         i8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3004,14 +3004,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
         u8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3288,7 +3288,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
         mask8x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -3412,14 +3412,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
         i16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3701,14 +3701,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
         u16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -3994,7 +3994,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
         mask16x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4118,14 +4118,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
         i32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4412,14 +4412,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
         u32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -4693,7 +4693,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         mask32x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -4817,14 +4817,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
         f64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5152,7 +5152,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
         mask64x4 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -5276,14 +5276,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
         f32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5693,14 +5693,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
         i8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -5975,14 +5975,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
         u8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6313,7 +6313,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         mask8x64 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -6430,14 +6430,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
         i16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -6721,14 +6721,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
         u16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7054,7 +7054,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         mask16x32 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -7174,14 +7174,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
         i32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7461,14 +7461,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
         u32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -7777,7 +7777,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         mask32x16 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
@@ -7894,14 +7894,14 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
     #[inline(always)]
     fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
         f64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(val) } },
+            val: unsafe { crate::support::checked_transmute_copy(val) },
             simd: self,
         }
     }
@@ -8222,7 +8222,7 @@ impl Simd for WasmSimd128 {
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
         mask64x8 {
-            val: { unsafe { crate::support::checked_transmute_copy(&val) } },
+            val: unsafe { crate::support::checked_transmute_copy(&val) },
             simd: self,
         }
     }
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index c4a11ee9e..a68378ee5 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -377,22 +377,23 @@ pub(crate) fn generic_from_array(
     // There are architecture-specific "load" intrinsics, but they can actually be *worse* for performance. If they
     // lower to LLVM intrinsics, they will likely not be optimized until much later in the pipeline (if at all),
     // resulting in substantially worse codegen. See https://github.com/linebender/fearless_simd/pull/185.
-    let expr = quote! {{
-        // Safety: The native vector type backing any implementation will be:
-        // - A `#[repr(simd)]` type, which has the same layout as an array of scalars
-        // - An array of `#[repr(simd)]` types
-        // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types
-        //
-        // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy`
-        // statically verifies that the source and destination sizes match. The native vector types probably have
-        // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by
-        // transmute_copy:
-        //
-        // > This function will unsafely assume the pointer src is valid for size_of::<Dst> bytes by transmuting &Src to
-        // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has
-        // > stricter alignment requirements than &Src).**
+    //
+    // Safety: The native vector type backing any implementation will be:
+    // - A `#[repr(simd)]` type, which has the same layout as an array of scalars
+    // - An array of `#[repr(simd)]` types
+    // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types
+    //
+    // These all have the same layout as a flat array of the corresponding scalars. `checked_transmute_copy`
+    // statically verifies that the source and destination sizes match. The native vector types probably have
+    // greater alignment requirements than the source array type we're copying from, but that's explicitly allowed by
+    // transmute_copy:
+    //
+    // > This function will unsafely assume the pointer src is valid for size_of::<Dst> bytes by transmuting &Src to
+    // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has
+    // > stricter alignment requirements than &Src).**
+    let expr = quote! {
         unsafe { crate::support::checked_transmute_copy(#inner_ref) }
-    }};
+    };
     let vec_rust = vec_ty.rust();
 
     quote! {

From 6f1081fdd226f87fd20a58b4098a6b5c1b6046db Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:07:31 +0100
Subject: [PATCH 19/55] KISS the native type mask roundtrip tests

---
 .../lm_generated/mask_roundtrip_x86.rs        | 443 +++++++++---------
 1 file changed, 230 insertions(+), 213 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
index 385a516cd..3f21c9391 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -5,37 +5,12 @@
 use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
+use core::convert::TryFrom;
+use core::mem::size_of;
 
 use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
-const INTERESTING_32: &[u64] = &[
-    0x0000_0000,
-    0x0000_0001,
-    0x8000_0000,
-    0x0000_ffff,
-    0xffff_0000,
-    0x5555_5555,
-    0xaaaa_aaaa,
-    0x8000_aa55,
-    0xffff_ffff,
-    0xffff_ffff_0000_0000,
-    0xffff_ffff_8000_aa55,
-    0xffff_ffff_ffff_ffff,
-];
-
-const INTERESTING_64: &[u64] = &[
-    0x0000_0000_0000_0000,
-    0x0000_0000_0000_0001,
-    0x8000_0000_0000_0000,
-    0x0000_0000_ffff_ffff,
-    0xffff_ffff_0000_0000,
-    0x5555_5555_5555_5555,
-    0xaaaa_aaaa_aaaa_aaaa,
-    0x8000_0001_5555_aaab,
-    0xffff_ffff_ffff_ffff,
-];
-
 fn lane_mask(lanes: usize) -> u64 {
     if lanes == u64::BITS as usize {
         u64::MAX
@@ -44,197 +19,239 @@ fn lane_mask(lanes: usize) -> u64 {
     }
 }
 
-trait MaskArch: Copy + Eq + core::fmt::Debug {
-    fn from_bits(bits: u64) -> Self;
+fn lanes_from_bits<L, const LANES: usize>(bits: u64) -> [L; LANES]
+where
+    L: Copy + From<i8>,
+{
+    let bits = bits & lane_mask(LANES);
+    core::array::from_fn(|i| {
+        if ((bits >> i) & 1) != 0 {
+            L::from(-1)
+        } else {
+            L::from(0)
+        }
+    })
 }
 
-impl MaskArch for u8 {
-    fn from_bits(bits: u64) -> Self {
-        Self::try_from(bits).expect("masked bits fit in __mmask8")
-    }
-}
-
-impl MaskArch for u16 {
-    fn from_bits(bits: u64) -> Self {
-        Self::try_from(bits).expect("masked bits fit in __mmask16")
-    }
-}
-
-impl MaskArch for u32 {
-    fn from_bits(bits: u64) -> Self {
-        Self::try_from(bits).expect("masked bits fit in __mmask32")
-    }
-}
-
-impl MaskArch for u64 {
-    fn from_bits(bits: u64) -> Self {
-        bits
-    }
-}
-
-macro_rules! assert_native_vector_roundtrip {
-    ($simd:expr, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $bits:expr) => {{
-        let bits = $bits;
-        let expected_bits = bits & lane_mask($lanes);
-        let expected_lanes: [$lane; $lanes] = core::array::from_fn(|i| {
-            if ((expected_bits >> i) & 1) != 0 {
-                -1
-            } else {
-                0
-            }
-        });
-
-        let mask = $mask::from_bitmask($simd, bits);
-        let arch: $arch = mask.into();
-        // Safety: these x86 vector types have the same size and lane layout as the signed
-        // integer arrays used for mask values.
-        let lanes = unsafe { core::mem::transmute::<$arch, [$lane; $lanes]>(arch) };
-        assert_eq!(
-            lanes,
-            expected_lanes,
-            "{} -> {} lane values for {bits:#018x}",
-            stringify!($mask),
-            stringify!($arch)
-        );
-
-        // Safety: this builds the native x86 vector value from the lane representation expected
-        // by the public mask conversion.
-        let arch = unsafe { core::mem::transmute::<[$lane; $lanes], $arch>(expected_lanes) };
-        let mask = $mask::simd_from($simd, arch);
-        assert_eq!(
-            mask.to_bitmask(),
-            expected_bits,
-            "{} <- {} bitmask for {bits:#018x}",
-            stringify!($mask),
-            stringify!($arch)
-        );
-    }};
-}
-
-macro_rules! assert_native_mask_roundtrip {
-    ($simd:expr, $mask:ident, $arch:ty, $lanes:literal, $bits:expr) => {{
-        let bits = $bits;
-        let expected_bits = bits & lane_mask($lanes);
-        let expected_arch = <$arch as MaskArch>::from_bits(expected_bits);
-
-        let mask = $mask::from_bitmask($simd, bits);
-        let arch: $arch = mask.into();
-        assert_eq!(
-            arch,
-            expected_arch,
-            "{} -> {} for {bits:#018x}",
-            stringify!($mask),
-            stringify!($arch)
-        );
-
-        let mask = $mask::simd_from($simd, expected_arch);
-        assert_eq!(
-            mask.to_bitmask(),
-            expected_bits,
-            "{} <- {} bitmask for {bits:#018x}",
-            stringify!($mask),
-            stringify!($arch)
-        );
-
-        let arch: $arch = mask.into();
-        assert_eq!(
-            arch,
-            expected_arch,
-            "{} -> {} after roundtrip for {bits:#018x}",
-            stringify!($mask),
-            stringify!($arch)
-        );
-    }};
-}
-
-macro_rules! native_vector_roundtrip_exhaustive {
-    ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal) => {
-        #[simd_test]
-        fn $test<S: Simd>(simd: S) {
-            for bits in 0..=0xffff_u64 {
-                assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits);
-            }
-        }
-    };
+fn assert_native_vector_roundtrip<S, M, A, L, const LANES: usize>(simd: S, bits: u64)
+where
+    S: Simd,
+    M: SimdMask<S> + SimdFrom<A, S> + Into<A>,
+    A: Copy,
+    L: Copy + Eq + core::fmt::Debug + From<i8>,
+{
+    let expected_bits = bits & lane_mask(LANES);
+    let expected_lanes = lanes_from_bits::<L, LANES>(bits);
+
+    assert_eq!(
+        size_of::<A>(),
+        size_of::<[L; LANES]>()
+    );
+
+    let mask = M::from_bitmask(simd, bits);
+    let arch: A = mask.into();
+    // Safety: the size assertion above verifies that the x86 vector type has
+    // the same size as the signed integer lane representation used for masks.
+    let lanes = unsafe { core::mem::transmute_copy::<A, [L; LANES]>(&arch) };
+    assert_eq!(lanes, expected_lanes);
+
+    // Safety: this builds the native x86 vector value from the lane
+    // representation expected by the public mask conversion.
+    let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) };
+    let mask = M::simd_from(simd, arch);
+    assert_eq!(mask.to_bitmask(), expected_bits);
 }
 
-macro_rules! native_vector_roundtrip_interesting {
-    ($test:ident, $mask:ident, $arch:ty, $lane:ty, $lanes:literal, $values:ident) => {
-        #[simd_test]
-        fn $test<S: Simd>(simd: S) {
-            for &bits in $values {
-                assert_native_vector_roundtrip!(simd, $mask, $arch, $lane, $lanes, bits);
-            }
-        }
-    };
+fn assert_native_mask_roundtrip<S, M, A, const LANES: usize>(simd: S, bits: u64)
+where
+    S: Simd,
+    M: SimdMask<S> + SimdFrom<A, S> + Into<A>,
+    A: Copy + Eq + core::fmt::Debug + TryFrom<u64>,
+    A::Error: core::fmt::Debug,
+{
+    let expected_bits = bits & lane_mask(LANES);
+    let expected_arch = A::try_from(expected_bits).expect("masked bits fit in native mask type");
+
+    let mask = M::from_bitmask(simd, bits);
+    let arch: A = mask.into();
+    assert_eq!(arch, expected_arch);
+
+    let mask = M::simd_from(simd, expected_arch);
+    assert_eq!(mask.to_bitmask(), expected_bits);
+
+    let arch: A = mask.into();
+    assert_eq!(arch, expected_arch);
 }
 
-macro_rules! native_mask_roundtrip_exhaustive {
-    ($test:ident, $mask:ident, $arch:ty, $lanes:literal) => {
-        #[simd_test]
-        fn $test<S: Simd>(simd: S) {
-            for bits in 0..=0xffff_u64 {
-                assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits);
-            }
-        }
-    };
+#[simd_test]
+fn mask8x16_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask8x16<S>, __m128i, i8, 16>(simd, bits);
+    }
 }
 
-macro_rules! native_mask_roundtrip_interesting {
-    ($test:ident, $mask:ident, $arch:ty, $lanes:literal, $values:ident) => {
-        #[simd_test]
-        fn $test<S: Simd>(simd: S) {
-            for &bits in $values {
-                assert_native_mask_roundtrip!(simd, $mask, $arch, $lanes, bits);
-            }
-        }
-    };
-}
-
-native_vector_roundtrip_exhaustive!(mask8x16_m128i_roundtrip, mask8x16, __m128i, i8, 16);
-native_vector_roundtrip_exhaustive!(mask16x8_m128i_roundtrip, mask16x8, __m128i, i16, 8);
-native_vector_roundtrip_exhaustive!(mask32x4_m128i_roundtrip, mask32x4, __m128i, i32, 4);
-native_vector_roundtrip_exhaustive!(mask64x2_m128i_roundtrip, mask64x2, __m128i, i64, 2);
-
-native_vector_roundtrip_interesting!(
-    mask8x32_m256i_roundtrip,
-    mask8x32,
-    __m256i,
-    i8,
-    32,
-    INTERESTING_32
-);
-native_vector_roundtrip_exhaustive!(mask16x16_m256i_roundtrip, mask16x16, __m256i, i16, 16);
-native_vector_roundtrip_exhaustive!(mask32x8_m256i_roundtrip, mask32x8, __m256i, i32, 8);
-native_vector_roundtrip_exhaustive!(mask64x4_m256i_roundtrip, mask64x4, __m256i, i64, 4);
-
-native_mask_roundtrip_exhaustive!(mask8x16_mmask16_roundtrip, mask8x16, __mmask16, 16);
-native_mask_roundtrip_exhaustive!(mask16x8_mmask8_roundtrip, mask16x8, __mmask8, 8);
-native_mask_roundtrip_exhaustive!(mask32x4_mmask8_roundtrip, mask32x4, __mmask8, 4);
-native_mask_roundtrip_exhaustive!(mask64x2_mmask8_roundtrip, mask64x2, __mmask8, 2);
-native_mask_roundtrip_interesting!(
-    mask8x32_mmask32_roundtrip,
-    mask8x32,
-    __mmask32,
-    32,
-    INTERESTING_32
-);
-native_mask_roundtrip_exhaustive!(mask16x16_mmask16_roundtrip, mask16x16, __mmask16, 16);
-native_mask_roundtrip_exhaustive!(mask32x8_mmask8_roundtrip, mask32x8, __mmask8, 8);
-native_mask_roundtrip_exhaustive!(mask64x4_mmask8_roundtrip, mask64x4, __mmask8, 4);
-native_mask_roundtrip_interesting!(
-    mask8x64_mmask64_roundtrip,
-    mask8x64,
-    __mmask64,
-    64,
-    INTERESTING_64
-);
-native_mask_roundtrip_interesting!(
-    mask16x32_mmask32_roundtrip,
-    mask16x32,
-    __mmask32,
-    32,
-    INTERESTING_32
-);
-native_mask_roundtrip_exhaustive!(mask32x16_mmask16_roundtrip, mask32x16, __mmask16, 16);
-native_mask_roundtrip_exhaustive!(mask64x8_mmask8_roundtrip, mask64x8, __mmask8, 8);
+#[simd_test]
+fn mask16x8_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask16x8<S>, __m128i, i16, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x4_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask32x4<S>, __m128i, i32, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x2_m128i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask64x2<S>, __m128i, i64, 2>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x32_m256i_roundtrip<S: Simd>(simd: S) {
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x0000_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x0000_0001);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x8000_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x0000_ffff);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x5555_5555);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xaaaa_aaaa);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0x8000_aa55);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff_0000_0000);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff_8000_aa55);
+    assert_native_vector_roundtrip::<S, mask8x32<S>, __m256i, i8, 32>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x16_m256i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask16x16<S>, __m256i, i16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x8_m256i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask32x8<S>, __m256i, i32, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x4_m256i_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_vector_roundtrip::<S, mask64x4<S>, __m256i, i64, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x16_mmask16_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask8x16<S>, __mmask16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask16x8_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask16x8<S>, __mmask8, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x4_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask32x4<S>, __mmask8, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x2_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask64x2<S>, __mmask8, 2>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x32_mmask32_roundtrip<S: Simd>(simd: S) {
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x0000_0001);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x8000_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x0000_ffff);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x5555_5555);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xaaaa_aaaa);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0x8000_aa55);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff_8000_aa55);
+    assert_native_mask_roundtrip::<S, mask8x32<S>, __mmask32, 32>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x16_mmask16_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask16x16<S>, __mmask16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask32x8_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask32x8<S>, __mmask8, 8>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x4_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask64x4<S>, __mmask8, 4>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask8x64_mmask64_roundtrip<S: Simd>(simd: S) {
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x0000_0000_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x0000_0000_0000_0001);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x8000_0000_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x0000_0000_ffff_ffff);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0xffff_ffff_0000_0000);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x5555_5555_5555_5555);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0xaaaa_aaaa_aaaa_aaaa);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0x8000_0001_5555_aaab);
+    assert_native_mask_roundtrip::<S, mask8x64<S>, __mmask64, 64>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x32_mmask32_roundtrip<S: Simd>(simd: S) {
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x0000_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x0000_0001);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x8000_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x0000_ffff);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x5555_5555);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xaaaa_aaaa);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0x8000_aa55);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff_0000_0000);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff_8000_aa55);
+    assert_native_mask_roundtrip::<S, mask16x32<S>, __mmask32, 32>(simd, 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask32x16_mmask16_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask32x16<S>, __mmask16, 16>(simd, bits);
+    }
+}
+
+#[simd_test]
+fn mask64x8_mmask8_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        assert_native_mask_roundtrip::<S, mask64x8<S>, __mmask8, 8>(simd, bits);
+    }
+}

From 1e2a0961d100693c6dd2a790f1829b8ae69407e7 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:09:35 +0100
Subject: [PATCH 20/55] cargo fmt

---
 .../tests/harness/lm_generated/mask_roundtrip_x86.rs         | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
index 3f21c9391..cade583d3 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -43,10 +43,7 @@ where
     let expected_bits = bits & lane_mask(LANES);
     let expected_lanes = lanes_from_bits::<L, LANES>(bits);
 
-    assert_eq!(
-        size_of::<A>(),
-        size_of::<[L; LANES]>()
-    );
+    assert_eq!(size_of::<A>(), size_of::<[L; LANES]>());
 
     let mask = M::from_bitmask(simd, bits);
     let arch: A = mask.into();

From 7fc16d4c9a8010a61e69c5b97f5b5c192c2761f3 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:15:37 +0100
Subject: [PATCH 21/55] Satisfy clippy some more. Hoisted by my own restriction
 lint.

---
 .../harness/lm_generated/mask_roundtrip.rs    |  6 +++---
 .../lm_generated/mask_roundtrip_x86.rs        | 20 +++++++++++++++++--
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
index ecc6f3c52..5433ce2a6 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip.rs
@@ -8,17 +8,17 @@ use fearless_simd_dev_macros::simd_test;
 /// `to_bitmask` and `test` in sync with the expected compact bitmask.
 fn assert_mask_set_roundtrip<S: Simd, M: SimdMask<S>>(simd: S) {
     let mut mask = M::from_bitmask(simd, 0);
-    let mut expected = 0u64;
+    let mut expected = 0_u64;
     for i in 0..M::N {
         mask.set(i, true);
-        expected |= 1u64 << i;
+        expected |= 1_u64 << i;
         assert_eq!(mask.to_bitmask(), expected);
         assert!(mask.test(i));
     }
 
     for i in 0..M::N {
         mask.set(i, false);
-        expected &= !(1u64 << i);
+        expected &= !(1_u64 << i);
         assert_eq!(mask.to_bitmask(), expected);
         assert!(!mask.test(i));
     }
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
index cade583d3..cbf2cacaf 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -33,6 +33,22 @@ where
     })
 }
 
+#[allow(
+    clippy::disallowed_methods,
+    reason = "test-only checked wrapper around transmute_copy"
+)]
+unsafe fn checked_transmute_copy<Src, Dst>(src: &Src) -> Dst {
+    const {
+        assert!(
+            size_of::<Src>() == size_of::<Dst>(),
+            "checked_transmute_copy requires source and destination to have the same size"
+        );
+    }
+    // Safety: the caller upholds `transmute_copy`'s validity requirements, and
+    // the const assertion above verifies that the source and destination sizes match.
+    unsafe { core::mem::transmute_copy(src) }
+}
+
 fn assert_native_vector_roundtrip<S, M, A, L, const LANES: usize>(simd: S, bits: u64)
 where
     S: Simd,
@@ -49,12 +65,12 @@ where
     let arch: A = mask.into();
     // Safety: the size assertion above verifies that the x86 vector type has
     // the same size as the signed integer lane representation used for masks.
-    let lanes = unsafe { core::mem::transmute_copy::<A, [L; LANES]>(&arch) };
+    let lanes = unsafe { checked_transmute_copy::<A, [L; LANES]>(&arch) };
     assert_eq!(lanes, expected_lanes);
 
     // Safety: this builds the native x86 vector value from the lane
     // representation expected by the public mask conversion.
-    let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) };
+    let arch = unsafe { checked_transmute_copy::<[L; LANES], A>(&expected_lanes) };
     let mask = M::simd_from(simd, arch);
     assert_eq!(mask.to_bitmask(), expected_bits);
 }

From 359650d70595646aa31a69afd0af834a1c8c0fd1 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:17:29 +0100
Subject: [PATCH 22/55] Satisfy the toml formatting check

---
 .clippy.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.clippy.toml b/.clippy.toml
index c9afb65bc..19a4f3c3b 100644
--- a/.clippy.toml
+++ b/.clippy.toml
@@ -8,8 +8,8 @@
 trivial-copy-size-limit = 16
 
 disallowed-methods = [
-    { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
-    { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
+  { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
+  { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
 ]
 
 # END LINEBENDER LINT SET

From 37df3e31af415d0cfcb62f8d27ee67b26a5e7673 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:22:05 +0100
Subject: [PATCH 23/55] Stick an #[expect] onto checked_transmute_copy on
 wasm32, otherwise we get dead code warnings

---
 fearless_simd/src/support.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs
index fce929808..3782f2081 100644
--- a/fearless_simd/src/support.rs
+++ b/fearless_simd/src/support.rs
@@ -37,6 +37,13 @@ pub struct Aligned512<T>(pub T);
 /// `src` must be valid to copy as `Dst`. This helper only checks the size invariant; the caller
 /// is still responsible for the rest of `transmute_copy`'s safety contract.
 #[inline(always)]
+#[cfg_attr(
+    target_arch = "wasm32",
+    expect(
+        dead_code,
+        reason = "native vector conversions are not used by the wasm32 libm Clippy build"
+    )
+)]
 #[allow(
     clippy::disallowed_methods,
     reason = "This is the central checked wrapper around transmute_copy"

From 8825bfbdbe5253e3d8246f3da7a861781063f872 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:25:48 +0100
Subject: [PATCH 24/55] Suppress an apparently buggy Clippy lint; surfaced only
 in `cargo clippy --tests` without a reported location, I've failed to isolate
 it to a specific crate and suppress it there

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 8721b67e4..9203fbead 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,7 +48,7 @@ clippy.disallowed_methods = "deny"
 clippy.doc_markdown = "warn"
 clippy.fn_to_numeric_cast_any = "warn"
 clippy.infinite_loop = "warn"
-clippy.large_stack_arrays = "warn"
+clippy.large_stack_arrays = "allow" # appears to be buggy as of 1.93, fixed in 1.95. TODO: re-enable
 clippy.mismatching_type_param_order = "warn"
 clippy.missing_assert_message = "warn"
 clippy.missing_fields_in_debug = "warn"

From cf3ff7d8f6d2438931d9b85e1ed263ae684c9ed3 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 22:29:40 +0100
Subject: [PATCH 25/55] Satisfy the toml formatter again

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 9203fbead..398c2c514 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,7 +48,7 @@ clippy.disallowed_methods = "deny"
 clippy.doc_markdown = "warn"
 clippy.fn_to_numeric_cast_any = "warn"
 clippy.infinite_loop = "warn"
-clippy.large_stack_arrays = "allow" # appears to be buggy as of 1.93, fixed in 1.95. TODO: re-enable
+clippy.large_stack_arrays = "allow"             # appears to be buggy as of 1.93, fixed in 1.95. TODO: re-enable
 clippy.mismatching_type_param_order = "warn"
 clippy.missing_assert_message = "warn"
 clippy.missing_fields_in_debug = "warn"

From cb5780f331a46642ad22cae3101982720216ae6a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 25 May 2026 00:07:21 +0100
Subject: [PATCH 26/55] Add miri opt-outs for extra slow tests

---
 fearless_simd_tests/tests/harness/lm_generated.rs | 2 ++
 fearless_simd_tests/tests/mod.rs                  | 1 +
 2 files changed, 3 insertions(+)

diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs
index a7d381969..34de5b16e 100644
--- a/fearless_simd_tests/tests/harness/lm_generated.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated.rs
@@ -2,7 +2,9 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
 mod extended_512;
+#[cfg(not(miri))] // too slow
 mod mask_roundtrip;
+#[cfg(not(miri))] // too slow
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod mask_roundtrip_x86;
 mod mod_256;
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index 6559ea92d..c3a8306f9 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -10,6 +10,7 @@ use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
 mod harness;
+#[cfg(not(miri))] // too slow
 mod soundness;
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]

From f55271b923a064558720014e97bdc7c7a68c1961 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 25 May 2026 10:01:52 +0100
Subject: [PATCH 27/55] Also enforce that both types are Copy in
 checked_transmute_copy. We can't enforce Pod without an external dependency.

---
 fearless_simd/src/support.rs                                    | 2 +-
 .../tests/harness/lm_generated/mask_roundtrip_x86.rs            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs
index 3782f2081..0de8e4f6a 100644
--- a/fearless_simd/src/support.rs
+++ b/fearless_simd/src/support.rs
@@ -48,7 +48,7 @@ pub struct Aligned512<T>(pub T);
     clippy::disallowed_methods,
     reason = "This is the central checked wrapper around transmute_copy"
 )]
-pub(crate) unsafe fn checked_transmute_copy<Src, Dst>(src: &Src) -> Dst {
+pub(crate) unsafe fn checked_transmute_copy<Src: Copy, Dst: Copy>(src: &Src) -> Dst {
     const {
         assert!(
             size_of::<Src>() == size_of::<Dst>(),
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
index cbf2cacaf..70c565dbc 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -37,7 +37,7 @@ where
     clippy::disallowed_methods,
     reason = "test-only checked wrapper around transmute_copy"
 )]
-unsafe fn checked_transmute_copy<Src, Dst>(src: &Src) -> Dst {
+unsafe fn checked_transmute_copy<Src: Copy, Dst: Copy>(src: &Src) -> Dst {
     const {
         assert!(
             size_of::<Src>() == size_of::<Dst>(),

From 15f5ab8a3b6564c4a3bfc35c326da6244630892f Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 25 May 2026 16:32:18 +0100
Subject: [PATCH 28/55] Fix disallowed methods setup that got mangled in the
 merge

---
 .clippy.toml                     | 5 -----
 fearless_simd_tests/tests/mod.rs | 4 ++++
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/.clippy.toml b/.clippy.toml
index 2cb1be3a3..f93546b19 100644
--- a/.clippy.toml
+++ b/.clippy.toml
@@ -7,11 +7,6 @@
 # 16 bytes is the number of bytes that fits into two 64-bit CPU registers.
 trivial-copy-size-limit = 16
 
-disallowed-methods = [
-  { path = "core::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
-  { path = "std::mem::transmute_copy", reason = "Use crate::support::checked_transmute_copy so equal sizes are asserted at compile time." },
-]
-
 # END LINEBENDER LINT SET
 
 disallowed-methods = [
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index c3a8306f9..bd64c14c9 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -5,6 +5,10 @@
     missing_docs,
     reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
 )]
+#![allow(
+    clippy::disallowed_methods,
+    reason = "fearless_simd_tests has test-only transmute helpers that should not be forced through the library's private checked transmute machinery"
+)]
 
 use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;

From 62337434ea27455ca2105868635679b7e8ce7336 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Mon, 25 May 2026 16:34:17 +0100
Subject: [PATCH 29/55] Drop the custom transmute_copy wrapper from tests to
 avoid confusion: it now has the same name as the production helper but
 different semantics

---
 .../lm_generated/mask_roundtrip_x86.rs        | 20 ++-----------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
index 70c565dbc..cade583d3 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_roundtrip_x86.rs
@@ -33,22 +33,6 @@ where
     })
 }
 
-#[allow(
-    clippy::disallowed_methods,
-    reason = "test-only checked wrapper around transmute_copy"
-)]
-unsafe fn checked_transmute_copy<Src: Copy, Dst: Copy>(src: &Src) -> Dst {
-    const {
-        assert!(
-            size_of::<Src>() == size_of::<Dst>(),
-            "checked_transmute_copy requires source and destination to have the same size"
-        );
-    }
-    // Safety: the caller upholds `transmute_copy`'s validity requirements, and
-    // the const assertion above verifies that the source and destination sizes match.
-    unsafe { core::mem::transmute_copy(src) }
-}
-
 fn assert_native_vector_roundtrip<S, M, A, L, const LANES: usize>(simd: S, bits: u64)
 where
     S: Simd,
@@ -65,12 +49,12 @@ where
     let arch: A = mask.into();
     // Safety: the size assertion above verifies that the x86 vector type has
     // the same size as the signed integer lane representation used for masks.
-    let lanes = unsafe { checked_transmute_copy::<A, [L; LANES]>(&arch) };
+    let lanes = unsafe { core::mem::transmute_copy::<A, [L; LANES]>(&arch) };
     assert_eq!(lanes, expected_lanes);
 
     // Safety: this builds the native x86 vector value from the lane
     // representation expected by the public mask conversion.
-    let arch = unsafe { checked_transmute_copy::<[L; LANES], A>(&expected_lanes) };
+    let arch = unsafe { core::mem::transmute_copy::<[L; LANES], A>(&expected_lanes) };
     let mask = M::simd_from(simd, arch);
     assert_eq!(mask.to_bitmask(), expected_bits);
 }

From 88bc247f83ffffb89c0177b8ed5aaf4b62ca454c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 26 May 2026 02:25:42 +0100
Subject: [PATCH 30/55] Optimize min_precise/max_precise for AVX-512, expand
 test coverage. AVX-512 has configurable comparison modes that we can use to
 implement the advertised _precise semantics.

---
 fearless_simd/src/generated/avx512.rs         | 72 +++---------------
 fearless_simd_gen/src/mk_x86.rs               | 22 ++----
 .../harness/lm_generated/extended_512.rs      | 76 +++++++++++++++++++
 .../tests/harness/lm_generated/mod_256.rs     | 38 ++++++++++
 fearless_simd_tests/tests/harness/mod.rs      | 34 +++++++++
 5 files changed, 167 insertions(+), 75 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 710888625..908845cb3 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -305,19 +305,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
-        unsafe {
-            let intermediate = _mm_max_ps(a.into(), b.into());
-            let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into());
-            _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
-        unsafe {
-            let intermediate = _mm_min_ps(a.into(), b.into());
-            let b_is_nan = _mm_cmp_ps_mask::<3i32>(b.into(), b.into());
-            _mm_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
@@ -2507,19 +2499,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
-        unsafe {
-            let intermediate = _mm_max_pd(a.into(), b.into());
-            let b_is_nan = _mm_cmp_pd_mask::<3i32>(b.into(), b.into());
-            _mm_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
-        unsafe {
-            let intermediate = _mm_min_pd(a.into(), b.into());
-            let b_is_nan = _mm_cmp_pd_mask::<3i32>(b.into(), b.into());
-            _mm_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
@@ -2969,19 +2953,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            let intermediate = _mm256_max_ps(a.into(), b.into());
-            let b_is_nan = _mm256_cmp_ps_mask::<3i32>(b.into(), b.into());
-            _mm256_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            let intermediate = _mm256_min_ps(a.into(), b.into());
-            let b_is_nan = _mm256_cmp_ps_mask::<3i32>(b.into(), b.into());
-            _mm256_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
@@ -5829,19 +5805,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            let intermediate = _mm256_max_pd(a.into(), b.into());
-            let b_is_nan = _mm256_cmp_pd_mask::<3i32>(b.into(), b.into());
-            _mm256_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            let intermediate = _mm256_min_pd(a.into(), b.into());
-            let b_is_nan = _mm256_cmp_pd_mask::<3i32>(b.into(), b.into());
-            _mm256_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
@@ -6336,19 +6304,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            let intermediate = _mm512_max_ps(a.into(), b.into());
-            let b_is_nan = _mm512_cmp_ps_mask::<3i32>(b.into(), b.into());
-            _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            let intermediate = _mm512_min_ps(a.into(), b.into());
-            let b_is_nan = _mm512_cmp_ps_mask::<3i32>(b.into(), b.into());
-            _mm512_mask_blend_ps(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
@@ -9365,19 +9325,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            let intermediate = _mm512_max_pd(a.into(), b.into());
-            let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into());
-            _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            let intermediate = _mm512_min_pd(a.into(), b.into());
-            let b_is_nan = _mm512_cmp_pd_mask::<3i32>(b.into(), b.into());
-            _mm512_mask_blend_pd(b_is_nan, intermediate, a.into()).simd_into(self)
-        }
+        unsafe { _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
     fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index bbbd975a2..ae90f6945 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -1602,24 +1602,16 @@ impl X86 {
             && matches!(method, "min_precise" | "max_precise")
         {
             let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
-            let minmax = intrinsic_ident(
-                if method == "max_precise" {
-                    "max"
-                } else {
-                    "min"
-                },
-                suffix,
-                vec_ty.n_bits(),
-            );
-            let cmp = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits());
-            let blend = avx512_mask_blend_intrinsic(vec_ty);
-            let unord = avx512_float_compare_predicate("unord");
+            let range = intrinsic_ident("range", suffix, vec_ty.n_bits());
+            let imm = if method == "max_precise" {
+                0b0101
+            } else {
+                0b0100
+            };
             return quote! {
                 #method_sig {
                     unsafe {
-                        let intermediate = #minmax(a.into(), b.into());
-                        let b_is_nan = #cmp::<#unord>(b.into(), b.into());
-                        #blend(b_is_nan, intermediate, a.into()).simd_into(self)
+                        #range::<#imm>(a.into(), b.into()).simd_into(self)
                     }
                 }
             };
diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
index e06ccf099..3e6bbdfb8 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
@@ -743,6 +743,82 @@ fn min_precise_f32x16_with_nan<S: Simd>(simd: S) {
     assert_eq!(result[15], 5.0);
 }
 
+#[simd_test]
+fn max_precise_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[2.0, -3.0, 0.0, 0.5, 1.0, 5.0, 3.0, 7.0]);
+    let b = f64x8::from_slice(simd, &[1.0, -2.0, 7.0, 3.0, 2.0, 4.0, 6.0, 5.0]);
+    assert_eq!(*a.max_precise(b), [2.0, -2.0, 7.0, 3.0, 2.0, 5.0, 6.0, 7.0]);
+}
+
+#[simd_test]
+fn min_precise_f64x8<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(simd, &[2.0, -3.0, 0.0, 0.5, 1.0, 5.0, 3.0, 7.0]);
+    let b = f64x8::from_slice(simd, &[1.0, -2.0, 7.0, 3.0, 2.0, 4.0, 6.0, 5.0]);
+    assert_eq!(*a.min_precise(b), [1.0, -3.0, 0.0, 0.5, 1.0, 4.0, 3.0, 5.0]);
+}
+
+#[simd_test]
+fn max_precise_f64x8_with_nan<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(
+        simd,
+        &[f64::NAN, -3.0, f64::INFINITY, 0.5, 1.0, f64::NAN, 3.0, 7.0],
+    );
+    let b = f64x8::from_slice(
+        simd,
+        &[
+            1.0,
+            f64::NAN,
+            7.0,
+            f64::NEG_INFINITY,
+            f64::NAN,
+            4.0,
+            6.0,
+            5.0,
+        ],
+    );
+    let result = a.max_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+    assert_eq!(result[2], f64::INFINITY);
+    assert_eq!(result[3], 0.5);
+    assert_eq!(result[4], 1.0);
+    assert_eq!(result[5], 4.0);
+    assert_eq!(result[6], 6.0);
+    assert_eq!(result[7], 7.0);
+}
+
+#[simd_test]
+fn min_precise_f64x8_with_nan<S: Simd>(simd: S) {
+    let a = f64x8::from_slice(
+        simd,
+        &[f64::NAN, -3.0, f64::INFINITY, 0.5, 1.0, f64::NAN, 3.0, 7.0],
+    );
+    let b = f64x8::from_slice(
+        simd,
+        &[
+            1.0,
+            f64::NAN,
+            7.0,
+            f64::NEG_INFINITY,
+            f64::NAN,
+            4.0,
+            6.0,
+            5.0,
+        ],
+    );
+    let result = a.min_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+    assert_eq!(result[2], 7.0);
+    assert_eq!(result[3], f64::NEG_INFINITY);
+    assert_eq!(result[4], 1.0);
+    assert_eq!(result[5], 4.0);
+    assert_eq!(result[6], 3.0);
+    assert_eq!(result[7], 5.0);
+}
+
 // =============================================================================
 // Shift operations tests (512-bit)
 // =============================================================================
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
index 7f33ebc6f..a40de562c 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
@@ -259,6 +259,44 @@ fn min_precise_f32x8_with_nan<S: Simd>(simd: S) {
     assert_eq!(result[7], 5.0);
 }
 
+#[simd_test]
+fn max_precise_f64x4<S: Simd>(simd: S) {
+    let a = f64x4::from_slice(simd, &[2.0, -3.0, 0.0, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, -2.0, 7.0, 3.0]);
+    assert_eq!(*a.max_precise(b), [2.0, -2.0, 7.0, 3.0]);
+}
+
+#[simd_test]
+fn min_precise_f64x4<S: Simd>(simd: S) {
+    let a = f64x4::from_slice(simd, &[2.0, -3.0, 0.0, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, -2.0, 7.0, 3.0]);
+    assert_eq!(*a.min_precise(b), [1.0, -3.0, 0.0, 0.5]);
+}
+
+#[simd_test]
+fn max_precise_f64x4_with_nan<S: Simd>(simd: S) {
+    let a = f64x4::from_slice(simd, &[f64::NAN, -3.0, f64::INFINITY, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, f64::NAN, 7.0, f64::NEG_INFINITY]);
+    let result = a.max_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+    assert_eq!(result[2], f64::INFINITY);
+    assert_eq!(result[3], 0.5);
+}
+
+#[simd_test]
+fn min_precise_f64x4_with_nan<S: Simd>(simd: S) {
+    let a = f64x4::from_slice(simd, &[f64::NAN, -3.0, f64::INFINITY, 0.5]);
+    let b = f64x4::from_slice(simd, &[1.0, f64::NAN, 7.0, f64::NEG_INFINITY]);
+    let result = a.min_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+    assert_eq!(result[2], 7.0);
+    assert_eq!(result[3], f64::NEG_INFINITY);
+}
+
 #[simd_test]
 fn floor_f32x8<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[2.0, -3.2, 0.0, 0.5, 1.7, -2.8, 3.1, -4.9]);
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index 3716efbce..d4c8dfef4 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -204,6 +204,40 @@ fn min_precise_f32x4_with_nan<S: Simd>(simd: S) {
     assert_eq!(result[3], f32::NEG_INFINITY);
 }
 
+#[simd_test]
+fn max_precise_f64x2<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[2.0, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, -2.0]);
+    assert_eq!(*a.max_precise(b), [2.0, -2.0]);
+}
+
+#[simd_test]
+fn min_precise_f64x2<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[2.0, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, -2.0]);
+    assert_eq!(*a.min_precise(b), [1.0, -3.0]);
+}
+
+#[simd_test]
+fn max_precise_f64x2_with_nan<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[f64::NAN, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, f64::NAN]);
+    let result = a.max_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+}
+
+#[simd_test]
+fn min_precise_f64x2_with_nan<S: Simd>(simd: S) {
+    let a = f64x2::from_slice(simd, &[f64::NAN, -3.0]);
+    let b = f64x2::from_slice(simd, &[1.0, f64::NAN]);
+    let result = a.min_precise(b);
+
+    assert_eq!(result[0], 1.0);
+    assert_eq!(result[1], -3.0);
+}
+
 #[simd_test]
 fn floor_f32x4<S: Simd>(simd: S) {
     let a = f32x4::from_slice(simd, &[2.0, -3.2, 0.0, 0.5]);

From 608b53fb24763d6056b10f6dd7834538c77f026c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 26 May 2026 02:29:33 +0100
Subject: [PATCH 31/55] Expand interleave/deinterleave test coverage

---
 .../harness/lm_generated/extended_512.rs      | 160 ++++++++++++++++++
 1 file changed, 160 insertions(+)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
index 3e6bbdfb8..f1e03a25b 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/extended_512.rs
@@ -1598,6 +1598,166 @@ fn unzip_high_u32x16<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn zip_unzip_i16x32<S: Simd>(simd: S) {
+    let a = i16x32::from_slice(
+        simd,
+        &[
+            -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4,
+            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+        ],
+    );
+    let b = i16x32::from_slice(
+        simd,
+        &[
+            1000, 999, 998, 997, 996, 995, 994, 993, 992, 991, 990, 989, 988, 987, 986, 985, 984,
+            983, 982, 981, 980, 979, 978, 977, 976, 975, 974, 973, 972, 971, 970, 969,
+        ],
+    );
+
+    assert_eq!(
+        *simd.zip_low_i16x32(a, b),
+        [
+            -16, 1000, -15, 999, -14, 998, -13, 997, -12, 996, -11, 995, -10, 994, -9, 993, -8,
+            992, -7, 991, -6, 990, -5, 989, -4, 988, -3, 987, -2, 986, -1, 985
+        ]
+    );
+    assert_eq!(
+        *simd.zip_high_i16x32(a, b),
+        [
+            0, 984, 1, 983, 2, 982, 3, 981, 4, 980, 5, 979, 6, 978, 7, 977, 8, 976, 9, 975, 10,
+            974, 11, 973, 12, 972, 13, 971, 14, 970, 15, 969
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_low_i16x32(a, b),
+        [
+            -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 1000, 998, 996, 994,
+            992, 990, 988, 986, 984, 982, 980, 978, 976, 974, 972, 970
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_high_i16x32(a, b),
+        [
+            -15, -13, -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15, 999, 997, 995, 993, 991,
+            989, 987, 985, 983, 981, 979, 977, 975, 973, 971, 969
+        ]
+    );
+
+    let (interleaved_low, interleaved_high) = simd.interleave_i16x32(a, b);
+    assert_eq!(
+        *interleaved_low,
+        [
+            -16, 1000, -15, 999, -14, 998, -13, 997, -12, 996, -11, 995, -10, 994, -9, 993, -8,
+            992, -7, 991, -6, 990, -5, 989, -4, 988, -3, 987, -2, 986, -1, 985
+        ]
+    );
+    assert_eq!(
+        *interleaved_high,
+        [
+            0, 984, 1, 983, 2, 982, 3, 981, 4, 980, 5, 979, 6, 978, 7, 977, 8, 976, 9, 975, 10,
+            974, 11, 973, 12, 972, 13, 971, 14, 970, 15, 969
+        ]
+    );
+
+    let (roundtrip_a, roundtrip_b) = simd.deinterleave_i16x32(interleaved_low, interleaved_high);
+    assert_eq!(
+        *roundtrip_a,
+        [
+            -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4,
+            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+        ]
+    );
+    assert_eq!(
+        *roundtrip_b,
+        [
+            1000, 999, 998, 997, 996, 995, 994, 993, 992, 991, 990, 989, 988, 987, 986, 985, 984,
+            983, 982, 981, 980, 979, 978, 977, 976, 975, 974, 973, 972, 971, 970, 969
+        ]
+    );
+}
+
+#[simd_test]
+fn zip_unzip_u16x32<S: Simd>(simd: S) {
+    let a = u16x32::from_slice(
+        simd,
+        &[
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31,
+        ],
+    );
+    let b = u16x32::from_slice(
+        simd,
+        &[
+            1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+            1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027,
+            1028, 1029, 1030, 1031,
+        ],
+    );
+
+    assert_eq!(
+        *simd.zip_low_u16x32(a, b),
+        [
+            0, 1000, 1, 1001, 2, 1002, 3, 1003, 4, 1004, 5, 1005, 6, 1006, 7, 1007, 8, 1008, 9,
+            1009, 10, 1010, 11, 1011, 12, 1012, 13, 1013, 14, 1014, 15, 1015
+        ]
+    );
+    assert_eq!(
+        *simd.zip_high_u16x32(a, b),
+        [
+            16, 1016, 17, 1017, 18, 1018, 19, 1019, 20, 1020, 21, 1021, 22, 1022, 23, 1023, 24,
+            1024, 25, 1025, 26, 1026, 27, 1027, 28, 1028, 29, 1029, 30, 1030, 31, 1031
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_low_u16x32(a, b),
+        [
+            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1000, 1002, 1004, 1006,
+            1008, 1010, 1012, 1014, 1016, 1018, 1020, 1022, 1024, 1026, 1028, 1030
+        ]
+    );
+    assert_eq!(
+        *simd.unzip_high_u16x32(a, b),
+        [
+            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 1001, 1003, 1005, 1007,
+            1009, 1011, 1013, 1015, 1017, 1019, 1021, 1023, 1025, 1027, 1029, 1031
+        ]
+    );
+
+    let (interleaved_low, interleaved_high) = simd.interleave_u16x32(a, b);
+    assert_eq!(
+        *interleaved_low,
+        [
+            0, 1000, 1, 1001, 2, 1002, 3, 1003, 4, 1004, 5, 1005, 6, 1006, 7, 1007, 8, 1008, 9,
+            1009, 10, 1010, 11, 1011, 12, 1012, 13, 1013, 14, 1014, 15, 1015
+        ]
+    );
+    assert_eq!(
+        *interleaved_high,
+        [
+            16, 1016, 17, 1017, 18, 1018, 19, 1019, 20, 1020, 21, 1021, 22, 1022, 23, 1023, 24,
+            1024, 25, 1025, 26, 1026, 27, 1027, 28, 1028, 29, 1029, 30, 1030, 31, 1031
+        ]
+    );
+
+    let (roundtrip_a, roundtrip_b) = simd.deinterleave_u16x32(interleaved_low, interleaved_high);
+    assert_eq!(
+        *roundtrip_a,
+        [
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+            24, 25, 26, 27, 28, 29, 30, 31
+        ]
+    );
+    assert_eq!(
+        *roundtrip_b,
+        [
+            1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+            1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027,
+            1028, 1029, 1030, 1031
+        ]
+    );
+}
+
 // =============================================================================
 // interleave tests (512-bit)
 // =============================================================================

From b03927fa768f842a6415c1734c20043bfa682539 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 21:46:07 +0100
Subject: [PATCH 32/55] Apply PR #233 load safety pattern to AVX512

Replace AVX512 interleaved load intrinsics emitted by the branch with checked_transmute_copy, then regenerate the generated AVX512 module.
---
 fearless_simd/src/generated/avx512.rs | 11 +++++++----
 fearless_simd_gen/src/mk_x86.rs       |  9 +++++++--
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 908845cb3..61118ce60 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -6373,8 +6373,8 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        let lanes: __m512 = crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src);
         unsafe {
-            let lanes = _mm512_loadu_ps(src.as_ptr() as *const _);
             _mm512_permutexvar_ps(
                 _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
                 lanes,
@@ -7251,8 +7251,9 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
+        let lanes: __m512i =
+            crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src);
         unsafe {
-            let lanes = _mm512_loadu_si512(src.as_ptr() as *const _);
             _mm512_permutexvar_epi8(
                 _mm512_set_epi8(
                     63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54, 50,
@@ -8091,8 +8092,9 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+        let lanes: __m512i =
+            crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src);
         unsafe {
-            let lanes = _mm512_loadu_si512(src.as_ptr() as *const _);
             _mm512_permutexvar_epi16(
                 _mm512_set_epi16(
                     31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17, 13,
@@ -8891,8 +8893,9 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
+        let lanes: __m512i =
+            crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src);
         unsafe {
-            let lanes = _mm512_loadu_si512(src.as_ptr() as *const _);
             _mm512_permutexvar_epi32(
                 _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
                 lanes,
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 443e1adf1..4408b621b 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -3136,7 +3136,9 @@ impl X86 {
             512,
             "AVX-512 interleaved loads only specialize 512-bit vectors"
         );
-        let load_unaligned = intrinsic_ident("loadu", coarse_type(vec_ty), vec_ty.n_bits());
+        let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
+        let native_ty = self.arch_ty(vec_ty);
+        let len = vec_ty.len;
         let permute = avx512_permutexvar_intrinsic(vec_ty);
         let indices = avx512_index_vector(
             vec_ty,
@@ -3145,8 +3147,11 @@ impl X86 {
 
         quote! {
             #method_sig {
+                let lanes: #native_ty =
+                    crate::transmute::checked_transmute_copy::<[#scalar_ty; #len], #native_ty>(
+                        src,
+                    );
                 unsafe {
-                    let lanes = #load_unaligned(src.as_ptr() as *const _);
                     #permute(#indices, lanes).simd_into(self)
                 }
             }

From b5de7ff8ac65e78a0557ff1b861ee8f7b9af6647 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 21:47:39 +0100
Subject: [PATCH 33/55] Apply PR #234 transmute pattern to AVX512

Regenerate the branch-added AVX512 module so by-value transmutes use checked_transmute_copy, matching PR #234.

Validation: cargo test
---
 fearless_simd/src/generated/avx512.rs | 432 ++++++++++----------------
 1 file changed, 168 insertions(+), 264 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 61118ce60..708267d07 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -124,7 +124,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
-        unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128, [f32; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
@@ -146,20 +146,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
-        unsafe {
-            f32x4 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        f32x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -433,7 +429,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
-        unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128i, [i8; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
@@ -455,20 +451,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
-        unsafe {
-            i8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -718,7 +710,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
-        unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128i, [u8; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
@@ -740,20 +732,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -1126,7 +1114,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
-        unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128i, [i16; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
@@ -1148,20 +1136,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
-        unsafe {
-            i16x8 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i16x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -1360,7 +1344,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
-        unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128i, [u16; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
@@ -1382,20 +1366,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
-        unsafe {
-            u16x8 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u16x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -1717,7 +1697,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
-        unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128i, [i32; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
@@ -1739,20 +1719,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
-        unsafe {
-            i32x4 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i32x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -1953,7 +1929,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
-        unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128i, [u32; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
@@ -1975,20 +1951,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
-        unsafe {
-            u32x4 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u32x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -2318,7 +2290,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
-        unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m128d, [f64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
@@ -2340,20 +2312,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
-        unsafe {
-            f64x2 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        f64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
-        unsafe {
-            u8x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -2701,7 +2669,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
-        unsafe { core::mem::transmute::<__m256, [f32; 8usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
@@ -2723,20 +2691,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
-        unsafe {
-            f32x8 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -3095,7 +3059,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
-        unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
@@ -3117,20 +3081,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            i8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -3490,7 +3450,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
-        unsafe { core::mem::transmute::<__m256i, [u8; 32usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
@@ -3512,20 +3472,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -4022,7 +3978,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
-        unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
@@ -4044,20 +4000,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
-        unsafe {
-            i16x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -4344,7 +4296,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
-        unsafe { core::mem::transmute::<__m256i, [u16; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
@@ -4366,20 +4318,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
-        unsafe {
-            u16x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u16x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -4807,7 +4755,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
-        unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
@@ -4829,20 +4777,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
-        unsafe {
-            i32x8 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -5117,7 +5061,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
-        unsafe { core::mem::transmute::<__m256i, [u32; 8usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
@@ -5139,20 +5083,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
-        unsafe {
-            u32x8 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -5573,7 +5513,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
-        unsafe { core::mem::transmute::<__m256d, [f64; 4usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
@@ -5595,20 +5535,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
-        unsafe {
-            f64x4 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
-        unsafe {
-            u8x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -6034,7 +5970,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
-        unsafe { core::mem::transmute::<__m512, [f32; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
@@ -6056,20 +5992,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
-        unsafe {
-            f32x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -6455,7 +6387,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
-        unsafe { core::mem::transmute::<__m512i, [i8; 64usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
@@ -6477,20 +6409,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            i8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -6874,7 +6802,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
-        unsafe { core::mem::transmute::<__m512i, [u8; 64usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512i, [u8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
@@ -6896,20 +6824,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -7438,7 +7362,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
-        unsafe { core::mem::transmute::<__m512i, [i16; 32usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
@@ -7460,20 +7384,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
-        unsafe {
-            i16x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i16x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -7780,7 +7700,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
-        unsafe { core::mem::transmute::<__m512i, [u16; 32usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
@@ -7802,20 +7722,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
-        unsafe {
-            u16x32 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u16x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -8283,7 +8199,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
-        unsafe { core::mem::transmute::<__m512i, [i32; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
@@ -8305,20 +8221,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
-        unsafe {
-            i32x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        i32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -8605,7 +8517,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
-        unsafe { core::mem::transmute::<__m512i, [u32; 16usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
@@ -8627,20 +8539,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
-        unsafe {
-            u32x16 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
@@ -9074,7 +8982,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
-        unsafe { core::mem::transmute::<__m512d, [f64; 8usize]>(a.val.0) }
+        crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
@@ -9096,20 +9004,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
-        unsafe {
-            f64x8 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]
     fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
-        unsafe {
-            u8x64 {
-                val: core::mem::transmute(a.val),
-                simd: self,
-            }
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
         }
     }
     #[inline(always)]

From ec4297074c19590884cefbe8b58fa11d8372e2a5 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 21:49:41 +0100
Subject: [PATCH 34/55] Apply PR #235 reference-cast pattern to AVX512

Regenerate the branch-added AVX512 module so reference casts use checked_cast_ref and checked_cast_mut. Also apply the float bit-pattern assertion style from PR #235 to the branch-added f32x16 interleaved-load test.

Validation: cargo test
---
 fearless_simd/src/generated/avx512.rs    | 96 ++++++++++++------------
 fearless_simd_tests/tests/harness/mod.rs |  9 +--
 2 files changed, 50 insertions(+), 55 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 708267d07..10c2a9658 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -128,11 +128,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
-        unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128, [f32; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
-        unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128, [f32; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
@@ -433,11 +433,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
-        unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128i, [i8; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128i, [i8; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
@@ -714,11 +714,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
-        unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128i, [u8; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128i, [u8; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
@@ -1118,11 +1118,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
-        unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128i, [i16; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
-        unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128i, [i16; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
@@ -1348,11 +1348,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
-        unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128i, [u16; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
-        unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128i, [u16; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
@@ -1701,11 +1701,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
-        unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128i, [i32; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
-        unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128i, [i32; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
@@ -1933,11 +1933,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
-        unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128i, [u32; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
-        unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128i, [u32; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
@@ -2294,11 +2294,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
-        unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m128d, [f64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
-        unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m128d, [f64; 2usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
@@ -2673,11 +2673,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
-        unsafe { core::mem::transmute::<&__m256, &[f32; 8usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
-        unsafe { core::mem::transmute::<&mut __m256, &mut [f32; 8usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
@@ -3063,11 +3063,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
-        unsafe { core::mem::transmute::<&__m256i, &[i8; 32usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
-        unsafe { core::mem::transmute::<&mut __m256i, &mut [i8; 32usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
@@ -3454,11 +3454,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
-        unsafe { core::mem::transmute::<&__m256i, &[u8; 32usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
-        unsafe { core::mem::transmute::<&mut __m256i, &mut [u8; 32usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
@@ -3982,11 +3982,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
-        unsafe { core::mem::transmute::<&__m256i, &[i16; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m256i, &mut [i16; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
@@ -4300,11 +4300,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
-        unsafe { core::mem::transmute::<&__m256i, &[u16; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m256i, &mut [u16; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
@@ -4759,11 +4759,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
-        unsafe { core::mem::transmute::<&__m256i, &[i32; 8usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
-        unsafe { core::mem::transmute::<&mut __m256i, &mut [i32; 8usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
@@ -5065,11 +5065,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
-        unsafe { core::mem::transmute::<&__m256i, &[u32; 8usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
-        unsafe { core::mem::transmute::<&mut __m256i, &mut [u32; 8usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
@@ -5517,11 +5517,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
-        unsafe { core::mem::transmute::<&__m256d, &[f64; 4usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
-        unsafe { core::mem::transmute::<&mut __m256d, &mut [f64; 4usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
@@ -5974,11 +5974,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
-        unsafe { core::mem::transmute::<&__m512, &[f32; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m512, &mut [f32; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
@@ -6391,11 +6391,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
-        unsafe { core::mem::transmute::<&__m512i, &[i8; 64usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
-        unsafe { core::mem::transmute::<&mut __m512i, &mut [i8; 64usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
@@ -6806,11 +6806,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
-        unsafe { core::mem::transmute::<&__m512i, &[u8; 64usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
-        unsafe { core::mem::transmute::<&mut __m512i, &mut [u8; 64usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
@@ -7366,11 +7366,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
-        unsafe { core::mem::transmute::<&__m512i, &[i16; 32usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
-        unsafe { core::mem::transmute::<&mut __m512i, &mut [i16; 32usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
@@ -7704,11 +7704,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
-        unsafe { core::mem::transmute::<&__m512i, &[u16; 32usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
-        unsafe { core::mem::transmute::<&mut __m512i, &mut [u16; 32usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
@@ -8203,11 +8203,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
-        unsafe { core::mem::transmute::<&__m512i, &[i32; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m512i, &mut [i32; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
@@ -8521,11 +8521,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
-        unsafe { core::mem::transmute::<&__m512i, &[u32; 16usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
-        unsafe { core::mem::transmute::<&mut __m512i, &mut [u32; 16usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
@@ -8986,11 +8986,11 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
-        unsafe { core::mem::transmute::<&__m512d, &[f64; 8usize]>(&a.val.0) }
+        crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
     fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
-        unsafe { core::mem::transmute::<&mut __m512d, &mut [f64; 8usize]>(&mut a.val.0) }
+        crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
     fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index 424ab6442..d75ec80af 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -907,13 +907,8 @@ fn load_interleaved_128_f32x16<S: Simd>(simd: S) {
         15.0,
     ];
 
-    // Note: f32::NAN != f32::NAN hence we transmute to compare the bit pattern
-    unsafe {
-        assert_eq!(
-            std::mem::transmute::<[f32; 16], [u32; 16]>(*result),
-            std::mem::transmute::<[f32; 16], [u32; 16]>(expected)
-        );
-    }
+    // Note: f32::NAN != f32::NAN hence we compare the bit pattern.
+    assert_eq!((*result).map(f32::to_bits), expected.map(f32::to_bits));
 }
 
 #[simd_test]

From 9ec500c1d5efa0055e7bcb589689c0cf7b406b5b Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 21:50:37 +0100
Subject: [PATCH 35/55] Record no branch-specific changes for PR #237

PR #237 only updates NEON load construction. The AVX512 branch-specific unsafe load sites were already adapted in the PR #233 follow-up, and a search found no remaining load intrinsics needing the #237 pattern.

From 73e5c96335c3add2d13be52dd10e34b92b2db899 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 12:30:14 +0100
Subject: [PATCH 36/55] Apply PR #239 vectorize safety cleanup to AVX512

---
 fearless_simd/src/generated/avx512.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 10c2a9658..baf0e5d26 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -99,7 +99,7 @@ impl Simd for Avx512 {
         #[target_feature(
             enable = "adx,aes,avx512bitalg,avx512bw,avx512cd,avx512dq,avx512f,avx512ifma,avx512vbmi,avx512vbmi2,avx512vl,avx512vnni,avx512vpopcntdq,bmi1,bmi2,cmpxchg16b,fma,gfni,lzcnt,movbe,pclmulqdq,popcnt,rdrand,rdseed,sha,vaes,vpclmulqdq,xsave,xsavec,xsaveopt,xsaves"
         )]
-        unsafe fn vectorize_avx512<F: FnOnce() -> R, R>(f: F) -> R {
+        fn vectorize_avx512<F: FnOnce() -> R, R>(f: F) -> R {
             f()
         }
         unsafe { vectorize_avx512(f) }

From 815ce0321cfd9ec4f81d820e9b30f2d3b5421b0d Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 12:32:39 +0100
Subject: [PATCH 37/55] Apply PR #238 safe store generation to AVX512

---
 fearless_simd/src/generated/avx512.rs | 192 ++++----------------------
 1 file changed, 24 insertions(+), 168 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index baf0e5d26..eb79a32c8 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -136,13 +136,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const f32,
-                dest.as_mut_ptr(),
-                4usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
@@ -441,13 +435,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i8,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
@@ -722,13 +710,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u8,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
@@ -1126,13 +1108,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i16,
-                dest.as_mut_ptr(),
-                8usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
@@ -1356,13 +1332,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u16,
-                dest.as_mut_ptr(),
-                8usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
@@ -1709,13 +1679,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i32,
-                dest.as_mut_ptr(),
-                4usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
@@ -1941,13 +1905,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u32,
-                dest.as_mut_ptr(),
-                4usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
@@ -2302,13 +2260,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const f64,
-                dest.as_mut_ptr(),
-                2usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
@@ -2681,13 +2633,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const f32,
-                dest.as_mut_ptr(),
-                8usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
@@ -3071,13 +3017,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i8,
-                dest.as_mut_ptr(),
-                32usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
@@ -3462,13 +3402,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u8,
-                dest.as_mut_ptr(),
-                32usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
@@ -3990,13 +3924,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i16,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
@@ -4308,13 +4236,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u16,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
@@ -4767,13 +4689,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i32,
-                dest.as_mut_ptr(),
-                8usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
@@ -5073,13 +4989,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u32,
-                dest.as_mut_ptr(),
-                8usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
@@ -5525,13 +5435,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const f64,
-                dest.as_mut_ptr(),
-                4usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
@@ -5982,13 +5886,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const f32,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
@@ -6399,13 +6297,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i8,
-                dest.as_mut_ptr(),
-                64usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
@@ -6814,13 +6706,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u8,
-                dest.as_mut_ptr(),
-                64usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
@@ -7374,13 +7260,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i16,
-                dest.as_mut_ptr(),
-                32usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
@@ -7712,13 +7592,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u16,
-                dest.as_mut_ptr(),
-                32usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
@@ -8211,13 +8085,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const i32,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
@@ -8529,13 +8397,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const u32,
-                dest.as_mut_ptr(),
-                16usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -8994,13 +8856,7 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                (&raw const a.val.0) as *const f64,
-                dest.as_mut_ptr(),
-                8usize,
-            );
-        }
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
     fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {

From 3c4bcbca0efcd325a4547edd755b36d96f28407a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 12:33:52 +0100
Subject: [PATCH 38/55] Record no branch-specific changes for PR #240


From 0847ebf3377fb35cd6447662cebdb3a665f8bc55 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 12:36:28 +0100
Subject: [PATCH 39/55] Record no branch-specific changes for PR #241


From 188740535f77db06a29a37af75231ad369185fb2 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 12:38:05 +0100
Subject: [PATCH 40/55] Record no branch-specific changes for PR #242


From 014e4b74b0ad27d6f13c20394b7c1af168157da7 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 13:11:18 +0100
Subject: [PATCH 41/55] cargo fmt

---
 fearless_simd_gen/src/mk_x86.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 33ec1eaac..8be86e867 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -2781,14 +2781,14 @@ impl X86 {
         if *self == Self::Avx512 {
             let lane_mask = avx512_mask_lane_bits(vec_ty);
             let bits = avx512_mask_bits_expr(quote! { a });
-        let expr = match (quantifier, condition) {
-            (Quantifier::Any, true) => quote! { bits != 0 },
-            (Quantifier::Any, false) => quote! { bits != #lane_mask },
-            (Quantifier::All, true) => quote! { bits == #lane_mask },
-            (Quantifier::All, false) => quote! { bits == 0 },
-        };
-        let method_sig = method_op.simd_trait_method_sig(vec_ty);
-        return quote! {
+            let expr = match (quantifier, condition) {
+                (Quantifier::Any, true) => quote! { bits != 0 },
+                (Quantifier::Any, false) => quote! { bits != #lane_mask },
+                (Quantifier::All, true) => quote! { bits == #lane_mask },
+                (Quantifier::All, false) => quote! { bits == 0 },
+            };
+            let method_sig = method_op.simd_trait_method_sig(vec_ty);
+            return quote! {
             #method_sig {
                 let bits = #bits & #lane_mask;
                 #expr

From d49f6a2ff403ff8941a8ecc962de67bc02342a37 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Thu, 18 Jun 2026 00:26:27 +0100
Subject: [PATCH 42/55] cargo fmt

---
 fearless_simd_gen/src/mk_x86.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 951bd7bed..ca36671be 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -2855,12 +2855,7 @@ impl X86 {
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
         if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
-            return self.handle_avx512_load_interleaved(
-                op,
-                vec_ty,
-                block_size,
-                block_count,
-            );
+            return self.handle_avx512_load_interleaved(op, vec_ty, block_size, block_count);
         }
         match vec_ty.scalar_bits {
             32 | 16 | 8 => {
@@ -3035,12 +3030,7 @@ impl X86 {
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
         if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
-            return self.handle_avx512_store_interleaved(
-                op,
-                vec_ty,
-                block_size,
-                block_count,
-            );
+            return self.handle_avx512_store_interleaved(op, vec_ty, block_size, block_count);
         }
         match vec_ty.scalar_bits {
             32 | 16 | 8 => {

From 046ee30955aed10551401ae98cbf80022e64a6c6 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Fri, 19 Jun 2026 17:26:53 +0100
Subject: [PATCH 43/55] Wrap AVX-512-specific codepaths in kernel! instead of
 unsafe where possible

---
 fearless_simd/src/generated/avx512.rs | 6983 ++++++++++++++++---------
 fearless_simd_gen/src/mk_x86.rs       |  397 +-
 2 files changed, 4636 insertions(+), 2744 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index e1f1761f2..2f73c5fc5 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -214,7 +214,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
-        unsafe { _mm_rcp14_ps(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>) -> f32x4<Avx512> {
+                _mm_rcp14_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
@@ -270,48 +276,68 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmp_ps_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmp_ps_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmp_ps_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmp_ps_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmp_ps_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
@@ -383,11 +409,23 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
-        unsafe { _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> f32x4<Avx512> {
+                _mm_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
-        unsafe { _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>, b: f32x4<Avx512>) -> f32x4<Avx512> {
+                _mm_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
@@ -469,7 +507,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
-        unsafe { _mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x4<Avx512>,
+                b: f32x4<Avx512>,
+                c: f32x4<Avx512>,
+            ) -> f32x4<Avx512> {
+                _mm_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
@@ -523,21 +572,31 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
-        unsafe { _mm_cvttps_epu32(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>) -> u32x4<Avx512> {
+                _mm_cvttps_epu32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
-        unsafe {
-            let a = _mm_max_ps(a.into(), _mm_setzero_ps());
-            let mut converted = _mm_cvttps_epu32(a);
-            let exceeds_unsigned_range = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), a);
-            converted = _mm_mask_blend_epi32(
-                exceeds_unsigned_range,
-                converted,
-                _mm_set1_epi32(u32::MAX.cast_signed()),
-            );
-            converted.simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x4<Avx512>) -> u32x4<Avx512> {
+                let a = _mm_max_ps(a.into(), _mm_setzero_ps());
+                let mut converted = _mm_cvttps_epu32(a);
+                let exceeds_unsigned_range = _mm_cmp_ps_mask::<17i32>(_mm_set1_ps(4294967040.0), a);
+                converted = _mm_mask_blend_epi32(
+                    exceeds_unsigned_range,
+                    converted,
+                    _mm_set1_epi32(u32::MAX.cast_signed()),
+                );
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
@@ -739,20 +798,24 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm_setzero_si128();
-            let value_extend = zero;
-            let lo_values = _mm_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm_set1_epi16(0x00ff);
-            let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask);
-            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> i8x16<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm_setzero_si128();
+                let value_extend = zero;
+                let lo_values = _mm_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm_set1_epi16(0x00ff);
+                let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
@@ -772,65 +835,89 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm_setzero_si128();
-            let value_extend = _mm_cmpgt_epi8(zero, val);
-            let lo_values = _mm_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm_set1_epi16(0x00ff);
-            let lo_shifted = _mm_and_si128(_mm_srav_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm_and_si128(_mm_srav_epi16(hi_values, hi_counts), byte_mask);
-            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> i8x16<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm_setzero_si128();
+                let value_extend = _mm_cmpgt_epi8(zero, val);
+                let lo_values = _mm_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm_set1_epi16(0x00ff);
+                let lo_shifted = _mm_and_si128(_mm_srav_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted = _mm_and_si128(_mm_srav_epi16(hi_values, hi_counts), byte_mask);
+                _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmpeq_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmpeq_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmplt_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmplt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmple_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmple_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmpge_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmpge_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmpgt_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmpgt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -888,7 +975,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
-        unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x16<Avx512>,
+                b: i8x16<Avx512>,
+                c: i8x16<Avx512>,
+            ) -> i8x16<Avx512> {
+                _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -1117,20 +1215,24 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm_setzero_si128();
-            let value_extend = zero;
-            let lo_values = _mm_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm_set1_epi16(0x00ff);
-            let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask);
-            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> u8x16<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm_setzero_si128();
+                let value_extend = zero;
+                let lo_values = _mm_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm_set1_epi16(0x00ff);
+                let lo_shifted = _mm_and_si128(_mm_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted = _mm_and_si128(_mm_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
@@ -1150,65 +1252,89 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm_setzero_si128();
-            let value_extend = zero;
-            let lo_values = _mm_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm_set1_epi16(0x00ff);
-            let lo_shifted = _mm_and_si128(_mm_srlv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm_and_si128(_mm_srlv_epi16(hi_values, hi_counts), byte_mask);
-            _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> u8x16<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm_setzero_si128();
+                let value_extend = zero;
+                let lo_values = _mm_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm_set1_epi16(0x00ff);
+                let lo_shifted = _mm_and_si128(_mm_srlv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted = _mm_and_si128(_mm_srlv_epi16(hi_values, hi_counts), byte_mask);
+                _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmpeq_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmpeq_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmplt_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmplt_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmple_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmple_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmpge_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmpge_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
-        unsafe {
-            mask8x16 {
-                val: _mm_cmpgt_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> mask8x16<Avx512> {
+                mask8x16 {
+                    val: _mm_cmpgt_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
@@ -1266,7 +1392,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
-        unsafe { _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x16<Avx512>,
+                b: u8x16<Avx512>,
+                c: u8x16<Avx512>,
+            ) -> u8x16<Avx512> {
+                _mm_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
@@ -1327,20 +1464,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask8x16 {
-                val: _mm_movepi8_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i8; 16usize]) -> mask8x16<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask8x16 {
+                    val: _mm_movepi8_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
-        unsafe {
-            let lanes = _mm_movm_epi8(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask8x16<Avx512>) -> [i8; 16usize] {
+                let lanes = _mm_movm_epi8(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
@@ -1599,7 +1744,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> i16x8<Avx512> {
+                _mm_sllv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
@@ -1613,52 +1764,78 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        unsafe { _mm_srav_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> i16x8<Avx512> {
+                _mm_srav_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmpeq_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmpeq_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmplt_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmplt_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmple_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmple_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmpge_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmpge_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmpgt_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmpgt_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
@@ -1716,7 +1893,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
-        unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x8<Avx512>,
+                b: i16x8<Avx512>,
+                c: i16x8<Avx512>,
+            ) -> i16x8<Avx512> {
+                _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
@@ -1932,7 +2120,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        unsafe { _mm_sllv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> u16x8<Avx512> {
+                _mm_sllv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
@@ -1946,52 +2140,78 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        unsafe { _mm_srlv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> u16x8<Avx512> {
+                _mm_srlv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmpeq_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmpeq_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmplt_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmplt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmple_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmple_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmpge_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmpge_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
-        unsafe {
-            mask16x8 {
-                val: _mm_cmpgt_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> mask16x8<Avx512> {
+                mask16x8 {
+                    val: _mm_cmpgt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
@@ -2049,7 +2269,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
-        unsafe { _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x8<Avx512>,
+                b: u16x8<Avx512>,
+                c: u16x8<Avx512>,
+            ) -> u16x8<Avx512> {
+                _mm_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
@@ -2110,20 +2341,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask16x8 {
-                val: _mm_movepi16_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i16; 8usize]) -> mask16x8<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask16x8 {
+                    val: _mm_movepi16_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
-        unsafe {
-            let lanes = _mm_movm_epi16(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask16x8<Avx512>) -> [i16; 8usize] {
+                let lanes = _mm_movm_epi16(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
@@ -2412,48 +2651,68 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmpeq_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmpeq_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmplt_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmplt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmple_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmple_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmpge_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmpge_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmpgt_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmpgt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
@@ -2509,7 +2768,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
-        unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x4<Avx512>,
+                b: i32x4<Avx512>,
+                c: i32x4<Avx512>,
+            ) -> i32x4<Avx512> {
+                _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
@@ -2765,48 +3035,68 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmpeq_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmpeq_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmplt_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmplt_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmple_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmple_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmpge_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmpge_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
-        unsafe {
-            mask32x4 {
-                val: _mm_cmpgt_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> mask32x4<Avx512> {
+                mask32x4 {
+                    val: _mm_cmpgt_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
@@ -2862,7 +3152,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
-        unsafe { _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x4<Avx512>,
+                b: u32x4<Avx512>,
+                c: u32x4<Avx512>,
+            ) -> u32x4<Avx512> {
+                _mm_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
@@ -2932,20 +3233,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask32x4 {
-                val: _mm_movepi32_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i32; 4usize]) -> mask32x4<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask32x4 {
+                    val: _mm_movepi32_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
-        unsafe {
-            let lanes = _mm_movm_epi32(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask32x4<Avx512>) -> [i32; 4usize] {
+                let lanes = _mm_movm_epi32(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
@@ -3160,7 +3469,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
-        unsafe { _mm_rcp14_pd(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>) -> f64x2<Avx512> {
+                _mm_rcp14_pd(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
@@ -3216,51 +3531,71 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
-        unsafe {
-            mask64x2 {
-                val: _mm_cmp_pd_mask::<0i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
-        unsafe {
-            mask64x2 {
-                val: _mm_cmp_pd_mask::<17i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmp_pd_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
-        unsafe {
-            mask64x2 {
-                val: _mm_cmp_pd_mask::<18i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmp_pd_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
-        unsafe {
-            mask64x2 {
-                val: _mm_cmp_pd_mask::<29i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmp_pd_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
-        unsafe {
-            mask64x2 {
-                val: _mm_cmp_pd_mask::<30i32>(a.into(), b.into()),
-                simd: self,
-            }
-        }
-    }
-    #[inline(always)]
-    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmp_pd_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> f64x2<Avx512> {
@@ -3329,11 +3664,23 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
-        unsafe { _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> f64x2<Avx512> {
+                _mm_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
-        unsafe { _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x2<Avx512>, b: f64x2<Avx512>) -> f64x2<Avx512> {
+                _mm_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
@@ -3415,7 +3762,18 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
-        unsafe { _mm_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask64x2<Avx512>,
+                b: f64x2<Avx512>,
+                c: f64x2<Avx512>,
+            ) -> f64x2<Avx512> {
+                _mm_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
@@ -3446,20 +3804,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask64x2 {
-                val: _mm_movepi64_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i64; 2usize]) -> mask64x2<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask64x2 {
+                    val: _mm_movepi64_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
-        unsafe {
-            let lanes = _mm_movm_epi64(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask64x2<Avx512>) -> [i64; 2usize] {
+                let lanes = _mm_movm_epi64(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
@@ -3620,27 +3986,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT * 4usize) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_f32x8(a).val.0,
-                idx,
-                self.cvt_to_bytes_f32x8(b).val.0,
-            );
-            self.cvt_from_bytes_f32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+                shift: usize,
+            ) -> f32x8<Avx512> {
+                if shift >= 8usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_f32x8(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_f32x8(b).val.0,
+                );
+                token.cvt_from_bytes_f32x8(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_f32x8<const SHIFT: usize>(
@@ -3694,7 +4069,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        unsafe { _mm256_rcp14_ps(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_rcp14_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
@@ -3753,118 +4134,170 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_ps(
-                a.into(),
-                _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_ps(
-                a.into(),
-                _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_ps(
-                a.into(),
-                _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_ps(
-                a.into(),
-                _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
-                    .simd_into(self),
-                _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+            ) -> (f32x8<Avx512>, f32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
-                    .simd_into(self),
-                _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+            ) -> (f32x8<Avx512>, f32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
@@ -3888,11 +4321,23 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe { _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe { _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
@@ -3974,13 +4419,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        unsafe { _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x8<Avx512>,
+                b: f32x8<Avx512>,
+                c: f32x8<Avx512>,
+            ) -> f32x8<Avx512> {
+                _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x16<Avx512> {
+                _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
@@ -4037,22 +4497,32 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        unsafe { _mm256_cvttps_epu32(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_cvttps_epu32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
-            let mut converted = _mm256_cvttps_epu32(a);
-            let exceeds_unsigned_range =
-                _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a);
-            converted = _mm256_mask_blend_epi32(
-                exceeds_unsigned_range,
-                converted,
-                _mm256_set1_epi32(u32::MAX.cast_signed()),
-            );
-            converted.simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
+                let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
+                let mut converted = _mm256_cvttps_epu32(a);
+                let exceeds_unsigned_range =
+                    _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a);
+                converted = _mm256_mask_blend_epi32(
+                    exceeds_unsigned_range,
+                    converted,
+                    _mm256_set1_epi32(u32::MAX.cast_signed()),
+                );
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
@@ -4143,27 +4613,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_i8x32(a).val.0,
-                idx,
-                self.cvt_to_bytes_i8x32(b).val.0,
-            );
-            self.cvt_from_bytes_i8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x32<Avx512>,
+                b: i8x32<Avx512>,
+                shift: usize,
+            ) -> i8x32<Avx512> {
+                if shift >= 32usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_i8x32(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i8x32(b).val.0,
+                );
+                token.cvt_from_bytes_i8x32(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_i8x32<const SHIFT: usize>(
@@ -4278,20 +4757,26 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm256_setzero_si256();
-            let value_extend = zero;
-            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm256_set1_epi16(0x00ff);
-            let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
-            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm256_setzero_si256();
+                let value_extend = zero;
+                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
@@ -4313,179 +4798,248 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm256_setzero_si256();
-            let value_extend = _mm256_cmpgt_epi8(zero, val);
-            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm256_set1_epi16(0x00ff);
-            let lo_shifted = _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask);
-            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm256_setzero_si256();
+                let value_extend = _mm256_cmpgt_epi8(zero, val);
+                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask);
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmpeq_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmpeq_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmplt_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmplt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmple_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmple_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmpge_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmpge_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmpgt_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmpgt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42,
-                    11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57,
-                    26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
-                    42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
-                    43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10,
                         42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25,
                         57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
                         40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39,
                         41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x32<Avx512>,
+                b: i8x32<Avx512>,
+            ) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41,
+                            10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56,
+                            25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x32<Avx512>,
+                b: i8x32<Avx512>,
+            ) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36,
+                            38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37,
+                            39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
-        unsafe { _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x32<Avx512>,
+                b: i8x32<Avx512>,
+                c: i8x32<Avx512>,
+            ) -> i8x32<Avx512> {
+                _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -4509,9 +5063,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
-        unsafe {
-            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x64<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
@@ -4612,27 +5170,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_u8x32(a).val.0,
-                idx,
-                self.cvt_to_bytes_u8x32(b).val.0,
-            );
-            self.cvt_from_bytes_u8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x32<Avx512>,
+                b: u8x32<Avx512>,
+                shift: usize,
+            ) -> u8x32<Avx512> {
+                if shift >= 32usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_u8x32(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_u8x32(b).val.0,
+                );
+                token.cvt_from_bytes_u8x32(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_u8x32<const SHIFT: usize>(
@@ -4745,20 +5312,26 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm256_setzero_si256();
-            let value_extend = zero;
-            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm256_set1_epi16(0x00ff);
-            let lo_shifted = _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
-            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm256_setzero_si256();
+                let value_extend = zero;
+                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
@@ -4778,179 +5351,248 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm256_setzero_si256();
-            let value_extend = zero;
-            let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm256_set1_epi16(0x00ff);
-            let lo_shifted = _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask);
-            _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm256_setzero_si256();
+                let value_extend = zero;
+                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask);
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmpeq_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmpeq_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmplt_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmplt_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmple_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmple_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmpge_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmpge_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        unsafe {
-            mask8x32 {
-                val: _mm256_cmpgt_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                mask8x32 {
+                    val: _mm256_cmpgt_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42,
-                    11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57,
-                    26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40,
-                    42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            _mm256_permutex2var_epi8(
-                a.into(),
-                _mm256_setr_epi8(
-                    1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
-                    43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10,
                         42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25,
                         57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
                         40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
                 _mm256_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm256_setr_epi8(
                         1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39,
                         41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x32<Avx512>,
+                b: u8x32<Avx512>,
+            ) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41,
+                            10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56,
+                            25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x32<Avx512>,
+                b: u8x32<Avx512>,
+            ) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36,
+                            38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi8(
+                        a,
+                        _mm256_setr_epi8(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37,
+                            39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
-        unsafe { _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x32<Avx512>,
+                b: u8x32<Avx512>,
+                c: u8x32<Avx512>,
+            ) -> u8x32<Avx512> {
+                _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
@@ -4974,9 +5616,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
-        unsafe {
-            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x64<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
@@ -5020,20 +5666,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask8x32 {
-                val: _mm256_movepi8_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i8; 32usize]) -> mask8x32<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask8x32 {
+                    val: _mm256_movepi8_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
-        unsafe {
-            let lanes = _mm256_movm_epi8(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask8x32<Avx512>) -> [i8; 32usize] {
+                let lanes = _mm256_movm_epi8(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
@@ -5208,27 +5862,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT * 2usize) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_i16x16(a).val.0,
-                idx,
-                self.cvt_to_bytes_i16x16(b).val.0,
-            );
-            self.cvt_from_bytes_i16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x16<Avx512>,
+                b: i16x16<Avx512>,
+                shift: usize,
+            ) -> i16x16<Avx512> {
+                if shift >= 16usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 2usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_i16x16(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i16x16(b).val.0,
+                );
+                token.cvt_from_bytes_i16x16(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_i16x16<const SHIFT: usize>(
@@ -5326,7 +5989,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_sllv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
@@ -5340,142 +6009,217 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe { _mm256_srav_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_srav_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmpeq_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpeq_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmplt_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmplt_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmple_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmple_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmpge_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpge_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmpgt_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpgt_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b,
-                )
-                .simd_into(self),
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x16<Avx512>,
+                b: i16x16<Avx512>,
+            ) -> (i16x16<Avx512>, i16x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b,
-                )
-                .simd_into(self),
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b,
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x16<Avx512>,
+                b: i16x16<Avx512>,
+            ) -> (i16x16<Avx512>, i16x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
-        unsafe { _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x16<Avx512>,
+                b: i16x16<Avx512>,
+                c: i16x16<Avx512>,
+            ) -> i16x16<Avx512> {
+                _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
@@ -5499,9 +6243,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
-        unsafe {
-            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x32<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
@@ -5602,27 +6350,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT * 2usize) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_u16x16(a).val.0,
-                idx,
-                self.cvt_to_bytes_u16x16(b).val.0,
-            );
-            self.cvt_from_bytes_u16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x16<Avx512>,
+                b: u16x16<Avx512>,
+                shift: usize,
+            ) -> u16x16<Avx512> {
+                if shift >= 16usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 2usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_u16x16(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_u16x16(b).val.0,
+                );
+                token.cvt_from_bytes_u16x16(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_u16x16<const SHIFT: usize>(
@@ -5720,7 +6477,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe { _mm256_sllv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_sllv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
@@ -5734,142 +6497,217 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe { _mm256_srlv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_srlv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmpeq_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpeq_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmplt_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmplt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmple_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmple_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmpge_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpge_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        unsafe {
-            mask16x16 {
-                val: _mm256_cmpgt_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpgt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            _mm256_permutex2var_epi16(
-                a.into(),
-                _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b,
-                )
-                .simd_into(self),
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x16<Avx512>,
+                b: u16x16<Avx512>,
+            ) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b,
-                )
-                .simd_into(self),
-                _mm256_permutex2var_epi16(
-                    a,
-                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x16<Avx512>,
+                b: u16x16<Avx512>,
+            ) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
-        unsafe { _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x16<Avx512>,
+                b: u16x16<Avx512>,
+                c: u16x16<Avx512>,
+            ) -> u16x16<Avx512> {
+                _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
@@ -5893,9 +6731,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
-        unsafe {
-            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x32<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
@@ -5949,20 +6791,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask16x16 {
-                val: _mm256_movepi16_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i16; 16usize]) -> mask16x16<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask16x16 {
+                    val: _mm256_movepi16_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        unsafe {
-            let lanes = _mm256_movm_epi16(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask16x16<Avx512>) -> [i16; 16usize] {
+                let lanes = _mm256_movm_epi16(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
@@ -6137,27 +6987,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT * 4usize) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_i32x8(a).val.0,
-                idx,
-                self.cvt_to_bytes_i32x8(b).val.0,
-            );
-            self.cvt_from_bytes_i32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x8<Avx512>,
+                b: i32x8<Avx512>,
+                shift: usize,
+            ) -> i32x8<Avx512> {
+                if shift >= 8usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_i32x8(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i32x8(b).val.0,
+                );
+                token.cvt_from_bytes_i32x8(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_i32x8<const SHIFT: usize>(
@@ -6285,122 +7144,185 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmpeq_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpeq_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmplt_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmplt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmple_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmple_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmpge_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpge_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmpgt_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpgt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
+                    b.into(),
+                )
+                .simd_into(token)
             }
-        }
-    }
-    #[inline(always)]
-    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
-                    .simd_into(self),
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x8<Avx512>,
+                b: i32x8<Avx512>,
+            ) -> (i32x8<Avx512>, i32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
-                    .simd_into(self),
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x8<Avx512>,
+                b: i32x8<Avx512>,
+            ) -> (i32x8<Avx512>, i32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
-        unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x8<Avx512>,
+                b: i32x8<Avx512>,
+                c: i32x8<Avx512>,
+            ) -> i32x8<Avx512> {
+                _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
@@ -6424,9 +7346,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
-        unsafe {
-            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x16<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
@@ -6537,27 +7463,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT * 4usize) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_u32x8(a).val.0,
-                idx,
-                self.cvt_to_bytes_u32x8(b).val.0,
-            );
-            self.cvt_from_bytes_u32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x8<Avx512>,
+                b: u32x8<Avx512>,
+                shift: usize,
+            ) -> u32x8<Avx512> {
+                if shift >= 8usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_u32x8(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_u32x8(b).val.0,
+                );
+                token.cvt_from_bytes_u32x8(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_u32x8<const SHIFT: usize>(
@@ -6685,122 +7620,185 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmpeq_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpeq_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmplt_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmplt_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmple_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmple_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmpge_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpge_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        unsafe {
-            mask32x8 {
-                val: _mm256_cmpgt_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpgt_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            _mm256_permutex2var_epi32(
-                a.into(),
-                _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
-                    .simd_into(self),
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x8<Avx512>,
+                b: u32x8<Avx512>,
+            ) -> (u32x8<Avx512>, u32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
-                    .simd_into(self),
-                _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x8<Avx512>,
+                b: u32x8<Avx512>,
+            ) -> (u32x8<Avx512>, u32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
-        unsafe { _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x8<Avx512>,
+                b: u32x8<Avx512>,
+                c: u32x8<Avx512>,
+            ) -> u32x8<Avx512> {
+                _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
@@ -6824,9 +7822,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
-        unsafe {
-            _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x16<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
@@ -6881,20 +7883,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask32x8 {
-                val: _mm256_movepi32_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i32; 8usize]) -> mask32x8<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask32x8 {
+                    val: _mm256_movepi32_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
-        unsafe {
-            let lanes = _mm256_movm_epi32(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask32x8<Avx512>) -> [i32; 8usize] {
+                let lanes = _mm256_movm_epi32(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
@@ -7069,27 +8079,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let idx = _mm256_add_epi8(
-                _mm256_setr_epi8(
-                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                    22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                ),
-                _mm256_set1_epi8((SHIFT * 8usize) as i8),
-            );
-            let result = _mm256_permutex2var_epi8(
-                self.cvt_to_bytes_f64x4(a).val.0,
-                idx,
-                self.cvt_to_bytes_f64x4(b).val.0,
-            );
-            self.cvt_from_bytes_f64x4(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+                shift: usize,
+            ) -> f64x4<Avx512> {
+                if shift >= 4usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 8usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_f64x4(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_f64x4(b).val.0,
+                );
+                token.cvt_from_bytes_f64x4(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_f64x4<const SHIFT: usize>(
@@ -7143,7 +8162,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        unsafe { _mm256_rcp14_pd(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_rcp14_pd(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
@@ -7202,98 +8227,150 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        unsafe {
-            mask64x4 {
-                val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        unsafe {
-            mask64x4 {
-                val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        unsafe {
-            mask64x4 {
-                val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        unsafe {
-            mask64x4 {
-                val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        unsafe {
-            mask64x4 {
-                val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(self),
-                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+            ) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(token),
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(self),
-                _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+            ) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(token),
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
@@ -7317,11 +8394,23 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe { _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe { _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
@@ -7403,13 +8492,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        unsafe { _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask64x4<Avx512>,
+                b: f64x4<Avx512>,
+                c: f64x4<Avx512>,
+            ) -> f64x4<Avx512> {
+                _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x8<Avx512> {
+                _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
@@ -7443,20 +8547,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask64x4 {
-                val: _mm256_movepi64_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i64; 4usize]) -> mask64x4<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask64x4 {
+                    val: _mm256_movepi64_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
-        unsafe {
-            let lanes = _mm256_movm_epi64(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask64x4<Avx512>) -> [i64; 4usize] {
+                let lanes = _mm256_movm_epi64(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
@@ -7631,29 +8743,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT * 4usize) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_f32x16(a).val.0,
-                idx,
-                self.cvt_to_bytes_f32x16(b).val.0,
-            );
-            self.cvt_from_bytes_f32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+                shift: usize,
+            ) -> f32x16<Avx512> {
+                if shift >= 16usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_f32x16(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_f32x16(b).val.0,
+                );
+                token.cvt_from_bytes_f32x16(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_f32x16<const SHIFT: usize>(
@@ -7707,7 +8828,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        unsafe { _mm512_rcp14_ps(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_rcp14_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
@@ -7766,134 +8893,192 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_ps(
-                a.into(),
-                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
+                    a.into(),
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_ps(
-                a.into(),
-                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
+                    a.into(),
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_ps(
-                a.into(),
-                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
+                    a.into(),
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_ps(
-                a.into(),
-                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
+                    a.into(),
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_ps(
-                    a,
-                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b,
-                )
-                .simd_into(self),
-                _mm512_permutex2var_ps(
-                    a,
-                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+            ) -> (f32x16<Avx512>, f32x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_ps(
-                    a,
-                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b,
-                )
-                .simd_into(self),
-                _mm512_permutex2var_ps(
-                    a,
-                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+            ) -> (f32x16<Avx512>, f32x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
@@ -7917,11 +9102,23 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe { _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe { _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
@@ -7955,24 +9152,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
@@ -7980,23 +9189,42 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        unsafe { _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x16<Avx512>,
+                b: f32x16<Avx512>,
+                c: f32x16<Avx512>,
+            ) -> f32x16<Avx512> {
+                _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        unsafe {
-            (
-                _mm512_castps512_ps256(a.into()).simd_into(self),
-                _mm512_extractf32x8_ps::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> (f32x8<Avx512>, f32x8<Avx512>) {
+                (
+                    _mm512_castps512_ps256(a.into()).simd_into(token),
+                    _mm512_extractf32x8_ps::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
@@ -8070,38 +9298,59 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        unsafe { _mm512_cvttps_epu32(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_cvttps_epu32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            let a = _mm512_max_ps(a.into(), _mm512_setzero_ps());
-            let mut converted = _mm512_cvttps_epu32(a);
-            let exceeds_unsigned_range =
-                _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a);
-            converted = _mm512_mask_blend_epi32(
-                exceeds_unsigned_range,
-                converted,
-                _mm512_set1_epi32(u32::MAX.cast_signed()),
-            );
-            converted.simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
+                let a = _mm512_max_ps(a.into(), _mm512_setzero_ps());
+                let mut converted = _mm512_cvttps_epu32(a);
+                let exceeds_unsigned_range =
+                    _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a);
+                converted = _mm512_mask_blend_epi32(
+                    exceeds_unsigned_range,
+                    converted,
+                    _mm512_set1_epi32(u32::MAX.cast_signed()),
+                );
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        unsafe { _mm512_cvttps_epi32(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_cvttps_epi32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            let a = a.into();
-            let mut converted = _mm512_cvttps_epi32(a);
-            let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0));
-            converted = _mm512_mask_blend_epi32(in_range, _mm512_set1_epi32(i32::MAX), converted);
-            let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a);
-            converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted);
-            converted.simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
+                let a = a.into();
+                let mut converted = _mm512_cvttps_epi32(a);
+                let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0));
+                converted =
+                    _mm512_mask_blend_epi32(in_range, _mm512_set1_epi32(i32::MAX), converted);
+                let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a);
+                converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted);
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn splat_i8x64(self, val: i8) -> i8x64<Self> {
@@ -8159,29 +9408,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            if SHIFT >= 64usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_i8x64(a).val.0,
-                idx,
-                self.cvt_to_bytes_i8x64(b).val.0,
-            );
-            self.cvt_from_bytes_i8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x64<Avx512>,
+                b: i8x64<Avx512>,
+                shift: usize,
+            ) -> i8x64<Avx512> {
+                if shift >= 64usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_i8x64(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i8x64(b).val.0,
+                );
+                token.cvt_from_bytes_i8x64(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_i8x64<const SHIFT: usize>(
@@ -8300,20 +9558,26 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm512_setzero_si512();
-            let value_extend = zero;
-            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm512_set1_epi16(0x00ff);
-            let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
-            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = zero;
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
@@ -8339,195 +9603,264 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm512_setzero_si512();
-            let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val));
-            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm512_set1_epi16(0x00ff);
-            let lo_shifted = _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask);
-            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val));
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmpeq_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpeq_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmplt_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmplt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmple_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmple_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmpge_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpge_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmpgt_epi8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpgt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
-                    85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
-                    75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
-                    64, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
-                    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
-                    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
-                    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
-                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
-                    54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
-                    14, 12, 10, 8, 6, 4, 2, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
-                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
-                    55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
-                    15, 13, 11, 9, 7, 5, 3, 1,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
                         22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
                         76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3,
                         66, 2, 65, 1, 64, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
                         119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
                         111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
                         103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
                         96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
                         58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
                         20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
                         97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
                         59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
                         21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x64<Avx512>,
+                b: i8x64<Avx512>,
+            ) -> (i8x64<Avx512>, i8x64<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23,
+                            86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14,
+                            77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68,
+                            4, 67, 3, 66, 2, 65, 1, 64, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
+                            119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
+                            111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
+                            103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x64<Avx512>,
+                b: i8x64<Avx512>,
+            ) -> (i8x64<Avx512>, i8x64<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100,
+                            98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
+                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101,
+                            99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
+                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
-        unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x64<Avx512>,
+                b: i8x64<Avx512>,
+                c: i8x64<Avx512>,
+            ) -> i8x64<Avx512> {
+                _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
@@ -8551,12 +9884,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        unsafe {
-            (
-                _mm512_castsi512_si256(a.into()).simd_into(self),
-                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x64<Avx512>) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
@@ -8644,29 +9981,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            if SHIFT >= 64usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_u8x64(a).val.0,
-                idx,
-                self.cvt_to_bytes_u8x64(b).val.0,
-            );
-            self.cvt_from_bytes_u8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x64<Avx512>,
+                b: u8x64<Avx512>,
+                shift: usize,
+            ) -> u8x64<Avx512> {
+                if shift >= 64usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_u8x64(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_u8x64(b).val.0,
+                );
+                token.cvt_from_bytes_u8x64(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_u8x64<const SHIFT: usize>(
@@ -8779,20 +10125,26 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm512_setzero_si512();
-            let value_extend = zero;
-            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm512_set1_epi16(0x00ff);
-            let lo_shifted = _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
-            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = zero;
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
@@ -8812,195 +10164,264 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            let val = a.into();
-            let counts = b.into();
-            let zero = _mm512_setzero_si512();
-            let value_extend = zero;
-            let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-            let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-            let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-            let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-            let byte_mask = _mm512_set1_epi16(0x00ff);
-            let lo_shifted = _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask);
-            let hi_shifted = _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask);
-            _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = zero;
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmpeq_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpeq_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmplt_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmplt_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmple_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmple_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmpge_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpge_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        unsafe {
-            mask8x64 {
-                val: _mm512_cmpgt_epu8_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpgt_epu8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86, 22,
-                    85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13, 76, 12,
-                    75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1,
-                    64, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, 119,
-                    55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, 111, 47,
-                    110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, 103, 39, 102,
-                    38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
-                    94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60, 58, 56,
-                    54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16,
-                    14, 12, 10, 8, 6, 4, 2, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            _mm512_permutex2var_epi8(
-                a.into(),
-                _mm512_set_epi8(
-                    127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
-                    95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61, 59, 57,
-                    55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23, 21, 19, 17,
-                    15, 13, 11, 9, 7, 5, 3, 1,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
                         22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
                         76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3,
                         66, 2, 65, 1, 64, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
                         119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
                         111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
                         103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
                         96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
                         58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
                         20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
                 _mm512_permutex2var_epi8(
-                    a,
+                    a.into(),
                     _mm512_set_epi8(
                         127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
                         97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
                         59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
                         21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x64<Avx512>,
+                b: u8x64<Avx512>,
+            ) -> (u8x64<Avx512>, u8x64<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23,
+                            86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14,
+                            77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68,
+                            4, 67, 3, 66, 2, 65, 1, 64, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
+                            119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
+                            111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
+                            103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x64<Avx512>,
+                b: u8x64<Avx512>,
+            ) -> (u8x64<Avx512>, u8x64<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100,
+                            98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
+                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101,
+                            99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
+                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
-        unsafe { _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x64<Avx512>,
+                b: u8x64<Avx512>,
+                c: u8x64<Avx512>,
+            ) -> u8x64<Avx512> {
+                _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
@@ -9024,12 +10445,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        unsafe {
-            (
-                _mm512_castsi512_si256(a.into()).simd_into(self),
-                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
@@ -9090,20 +10515,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask8x64 {
-                val: _mm512_movepi8_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i8; 64usize]) -> mask8x64<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask8x64 {
+                    val: _mm512_movepi8_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
-        unsafe {
-            let lanes = _mm512_movm_epi8(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask8x64<Avx512>) -> [i8; 64usize] {
+                let lanes = _mm512_movm_epi8(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
@@ -9270,29 +10703,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT * 2usize) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_i16x32(a).val.0,
-                idx,
-                self.cvt_to_bytes_i16x32(b).val.0,
-            );
-            self.cvt_from_bytes_i16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x32<Avx512>,
+                b: i16x32<Avx512>,
+                shift: usize,
+            ) -> i16x32<Avx512> {
+                if shift >= 32usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift * 2usize) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_i16x32(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i16x32(b).val.0,
+                );
+                token.cvt_from_bytes_i16x32(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_i16x32<const SHIFT: usize>(
@@ -9390,7 +10832,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_sllv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
@@ -9404,166 +10852,235 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe { _mm512_srav_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_srav_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmpeq_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpeq_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmplt_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmplt_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmple_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmple_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmpge_epi16_mask(a.into(), b.into()),
-                simd: self,
-            }
-        }
-    }
-    #[inline(always)]
-    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmpgt_epi16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpge_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
-    }
-    #[inline(always)]
-    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37,
-                    5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
-                    53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
-                    22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        );
+        kernel(self, a, b)
     }
-    #[inline(always)]
-    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
-                    23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+    #[inline(always)]
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpgt_epi16_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
                         37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
                         22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
                         24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
                         25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x32<Avx512>,
+                b: i16x32<Avx512>,
+            ) -> (i16x32<Avx512>, i16x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7,
+                            38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23,
+                            54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x32<Avx512>,
+                b: i16x32<Avx512>,
+            ) -> (i16x32<Avx512>, i16x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
+                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
+                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
-        unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x32<Avx512>,
+                b: i16x32<Avx512>,
+                c: i16x32<Avx512>,
+            ) -> i16x32<Avx512> {
+                _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
@@ -9587,12 +11104,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        unsafe {
-            (
-                _mm512_castsi512_si256(a.into()).simd_into(self),
-                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x32<Avx512>) -> (i16x16<Avx512>, i16x16<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
@@ -9680,29 +11201,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT * 2usize) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_u16x32(a).val.0,
-                idx,
-                self.cvt_to_bytes_u16x32(b).val.0,
-            );
-            self.cvt_from_bytes_u16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x32<Avx512>,
+                b: u16x32<Avx512>,
+                shift: usize,
+            ) -> u16x32<Avx512> {
+                if shift >= 32usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift * 2usize) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_u16x32(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_u16x32(b).val.0,
+                );
+                token.cvt_from_bytes_u16x32(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_u16x32<const SHIFT: usize>(
@@ -9800,7 +11330,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe { _mm512_sllv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_sllv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
@@ -9814,166 +11350,235 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe { _mm512_srlv_epi16(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_srlv_epi16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmpeq_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpeq_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmplt_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmplt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmple_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmple_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmpge_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpge_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        unsafe {
-            mask16x32 {
-                val: _mm512_cmpgt_epu16_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpgt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6, 37,
-                    5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54, 22,
-                    53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24,
-                    22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            _mm512_permutex2var_epi16(
-                a.into(),
-                _mm512_set_epi16(
-                    63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25,
-                    23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
-                ),
-                b.into(),
-            )
-            .simd_into(self)
-        }
-    }
-    #[inline(always)]
-    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
                         37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
                         22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
                         24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_permutex2var_epi16(
-                    a,
+                    a.into(),
                     _mm512_set_epi16(
                         63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
                         25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                     ),
-                    b,
+                    b.into(),
                 )
-                .simd_into(self),
-            )
-        }
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x32<Avx512>,
+                b: u16x32<Avx512>,
+            ) -> (u16x32<Avx512>, u16x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7,
+                            38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23,
+                            54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x32<Avx512>,
+                b: u16x32<Avx512>,
+            ) -> (u16x32<Avx512>, u16x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
+                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi16(
+                        a,
+                        _mm512_set_epi16(
+                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
+                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
-        unsafe { _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x32<Avx512>,
+                b: u16x32<Avx512>,
+                c: u16x32<Avx512>,
+            ) -> u16x32<Avx512> {
+                _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
@@ -9997,12 +11602,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        unsafe {
-            (
-                _mm512_castsi512_si256(a.into()).simd_into(self),
-                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
@@ -10079,20 +11688,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask16x32 {
-                val: _mm512_movepi16_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i16; 32usize]) -> mask16x32<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask16x32 {
+                    val: _mm512_movepi16_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
-        unsafe {
-            let lanes = _mm512_movm_epi16(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask16x32<Avx512>) -> [i16; 32usize] {
+                let lanes = _mm512_movm_epi16(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
@@ -10259,29 +11876,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT * 4usize) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_i32x16(a).val.0,
-                idx,
-                self.cvt_to_bytes_i32x16(b).val.0,
-            );
-            self.cvt_from_bytes_i32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x16<Avx512>,
+                b: i32x16<Avx512>,
+                shift: usize,
+            ) -> i32x16<Avx512> {
+                if shift >= 16usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_i32x16(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i32x16(b).val.0,
+                );
+                token.cvt_from_bytes_i32x16(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_i32x16<const SHIFT: usize>(
@@ -10409,138 +12035,207 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmpeq_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpeq_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmplt_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmplt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmple_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmple_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmpge_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpge_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmpgt_epi32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpgt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b,
-                )
-                .simd_into(self),
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x16<Avx512>,
+                b: i32x16<Avx512>,
+            ) -> (i32x16<Avx512>, i32x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b,
-                )
-                .simd_into(self),
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x16<Avx512>,
+                b: i32x16<Avx512>,
+            ) -> (i32x16<Avx512>, i32x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
-        unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x16<Avx512>,
+                b: i32x16<Avx512>,
+                c: i32x16<Avx512>,
+            ) -> i32x16<Avx512> {
+                _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
@@ -10564,12 +12259,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        unsafe {
-            (
-                _mm512_castsi512_si256(a.into()).simd_into(self),
-                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> (i32x8<Avx512>, i32x8<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
@@ -10603,7 +12302,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
-        unsafe { _mm512_cvtepi32_ps(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_cvtepi32_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn splat_u32x16(self, val: u32) -> u32x16<Self> {
@@ -10661,29 +12366,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT * 4usize) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_u32x16(a).val.0,
-                idx,
-                self.cvt_to_bytes_u32x16(b).val.0,
-            );
-            self.cvt_from_bytes_u32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x16<Avx512>,
+                b: u32x16<Avx512>,
+                shift: usize,
+            ) -> u32x16<Avx512> {
+                if shift >= 16usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_u32x16(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_u32x16(b).val.0,
+                );
+                token.cvt_from_bytes_u32x16(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_u32x16<const SHIFT: usize>(
@@ -10811,138 +12525,207 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmpeq_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpeq_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmplt_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmplt_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmple_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmple_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmpge_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpge_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        unsafe {
-            mask32x16 {
-                val: _mm512_cmpgt_epu32_mask(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpgt_epu32_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            _mm512_permutex2var_epi32(
-                a.into(),
-                _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b,
-                )
-                .simd_into(self),
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x16<Avx512>,
+                b: u32x16<Avx512>,
+            ) -> (u32x16<Avx512>, u32x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b,
-                )
-                .simd_into(self),
-                _mm512_permutex2var_epi32(
-                    a,
-                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x16<Avx512>,
+                b: u32x16<Avx512>,
+            ) -> (u32x16<Avx512>, u32x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi32(
+                        a,
+                        _mm512_setr_epi32(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
-                .simd_into(self),
-            )
-        }
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
-        unsafe { _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x16<Avx512>,
+                b: u32x16<Avx512>,
+                c: u32x16<Avx512>,
+            ) -> u32x16<Avx512> {
+                _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
@@ -10966,12 +12749,16 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        unsafe {
-            (
-                _mm512_castsi512_si256(a.into()).simd_into(self),
-                _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>) -> (u32x8<Avx512>, u32x8<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
@@ -11015,7 +12802,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
-        unsafe { _mm512_cvtepu32_ps(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_cvtepu32_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
@@ -11026,20 +12819,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask32x16 {
-                val: _mm512_movepi32_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i32; 16usize]) -> mask32x16<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask32x16 {
+                    val: _mm512_movepi32_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
-        unsafe {
-            let lanes = _mm512_movm_epi32(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask32x16<Avx512>) -> [i32; 16usize] {
+                let lanes = _mm512_movm_epi32(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
@@ -11206,29 +13007,38 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let idx = _mm512_add_epi8(
-                _mm512_set_epi8(
-                    63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44,
-                    43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
-                    23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
-                    1, 0,
-                ),
-                _mm512_set1_epi8((SHIFT * 8usize) as i8),
-            );
-            let result = _mm512_permutex2var_epi8(
-                self.cvt_to_bytes_f64x8(a).val.0,
-                idx,
-                self.cvt_to_bytes_f64x8(b).val.0,
-            );
-            self.cvt_from_bytes_f64x8(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+                shift: usize,
+            ) -> f64x8<Avx512> {
+                if shift >= 8usize {
+                    return b;
+                }
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
+                    ),
+                    _mm512_set1_epi8((shift * 8usize) as i8),
+                );
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_f64x8(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_f64x8(b).val.0,
+                );
+                token.cvt_from_bytes_f64x8(u8x64 {
+                    val: crate::support::Aligned512(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
     fn slide_within_blocks_f64x8<const SHIFT: usize>(
@@ -11282,7 +13092,13 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        unsafe { _mm512_rcp14_pd(a.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_rcp14_pd(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
@@ -11341,118 +13157,170 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        unsafe {
-            mask64x8 {
-                val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        unsafe {
-            mask64x8 {
-                val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        unsafe {
-            mask64x8 {
-                val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        unsafe {
-            mask64x8 {
-                val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        unsafe {
-            mask64x8 {
-                val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_permutex2var_pd(
-                a.into(),
-                _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
+                    a.into(),
+                    _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_permutex2var_pd(
-                a.into(),
-                _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
+                    a.into(),
+                    _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_permutex2var_pd(
-                a.into(),
-                _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
+                    a.into(),
+                    _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_permutex2var_pd(
-                a.into(),
-                _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15),
-                b.into(),
-            )
-            .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
+                    a.into(),
+                    _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
-                    .simd_into(self),
-                _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+            ) -> (f64x8<Avx512>, f64x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        unsafe {
-            let a = a.into();
-            let b = b.into();
-            (
-                _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
-                    .simd_into(self),
-                _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
-                    .simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+            ) -> (f64x8<Avx512>, f64x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
@@ -11476,11 +13344,23 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe { _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe { _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
@@ -11514,24 +13394,36 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
@@ -11539,23 +13431,42 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                .simd_into(self)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        unsafe { _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(self) }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask64x8<Avx512>,
+                b: f64x8<Avx512>,
+                c: f64x8<Avx512>,
+            ) -> f64x8<Avx512> {
+                _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
     fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        unsafe {
-            (
-                _mm512_castpd512_pd256(a.into()).simd_into(self),
-                _mm512_extractf64x4_pd::<1>(a.into()).simd_into(self),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                (
+                    _mm512_castpd512_pd256(a.into()).simd_into(token),
+                    _mm512_extractf64x4_pd::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
@@ -11576,20 +13487,28 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
-        unsafe {
-            let lanes = crate::transmute::checked_transmute_copy(&val);
-            mask64x8 {
-                val: _mm512_movepi64_mask(lanes),
-                simd: self,
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i64; 8usize]) -> mask64x8<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask64x8 {
+                    val: _mm512_movepi64_mask(lanes),
+                    simd: token,
+                }
             }
-        }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
     fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
-        unsafe {
-            let lanes = _mm512_movm_epi64(a.val);
-            crate::transmute::checked_transmute_copy(&lanes)
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask64x8<Avx512>) -> [i64; 8usize] {
+                let lanes = _mm512_movm_epi64(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
     fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index ca36671be..411dc4566 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -336,13 +336,13 @@ impl Level for X86 {
             OpSig::FromArray { kind }
                 if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask =>
             {
-                self.handle_avx512_mask_from_array(method_sig, vec_ty, kind)
+                self.handle_avx512_mask_from_array(op, vec_ty, kind)
             }
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind }
                 if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask =>
             {
-                self.handle_avx512_mask_as_array(method_sig, vec_ty, kind)
+                self.handle_avx512_mask_as_array(op, vec_ty, kind)
             }
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
@@ -735,7 +735,11 @@ fn avx512_mask_lane_bits(vec_ty: &VecType) -> TokenStream {
     }
 }
 
-fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+fn avx512_mask_value_with_simd(
+    vec_ty: &VecType,
+    bits: TokenStream,
+    simd: TokenStream,
+) -> TokenStream {
     let ty = vec_ty.rust();
     let bits = if avx512_mask_register_bits(vec_ty) == 64 {
         bits
@@ -745,17 +749,25 @@ fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
     quote! {
         #ty {
             val: #bits,
-            simd: self,
+            simd: #simd,
         }
     }
 }
 
-fn avx512_mask_register_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+fn avx512_mask_value(vec_ty: &VecType, bits: TokenStream) -> TokenStream {
+    avx512_mask_value_with_simd(vec_ty, bits, quote! { self }) // bind the mask to `self`
+}
+
+fn avx512_mask_register_value_with_simd(
+    vec_ty: &VecType,
+    bits: TokenStream,
+    simd: TokenStream,
+) -> TokenStream { // yields a `#ty { val, simd }` struct-literal token stream
     let ty = vec_ty.rust();
     quote! {
         #ty {
             val: #bits,
-            simd: self,
+            simd: #simd, // caller-supplied token expression
         }
     }
 }
@@ -954,7 +966,7 @@ impl X86 {
 
     pub(crate) fn handle_avx512_mask_from_array(
         &self,
-        method_sig: TokenStream,
+        op: Op,
         vec_ty: &VecType,
         kind: crate::ops::RefKind,
     ) -> TokenStream {
@@ -975,20 +987,22 @@ impl X86 {
         };
         // Mask arrays are specified as either 0 or -1 per lane, so the sign bit is the
         // truth value. Other lane values have unspecified results.
-        let result = avx512_mask_register_value(vec_ty, quote! { #movepi_mask(lanes) });
-        quote! {
-            #method_sig {
-                unsafe {
-                    let lanes = crate::transmute::checked_transmute_copy(#transmute_src);
-                    #result
-                }
+        self.kernel_method(op, vec_ty, |token| {
+            let result = avx512_mask_register_value_with_simd(
+                vec_ty,
+                quote! { #movepi_mask(lanes) },
+                quote! { #token },
+            );
+            quote! {
+                let lanes = crate::transmute::checked_transmute_copy(#transmute_src);
+                #result
             }
-        }
+        })
     }
 
     pub(crate) fn handle_avx512_mask_as_array(
         &self,
-        method_sig: TokenStream,
+        op: Op,
         vec_ty: &VecType,
         kind: crate::ops::RefKind,
     ) -> TokenStream {
@@ -1006,14 +1020,12 @@ impl X86 {
             op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
             vec_ty.n_bits(),
         );
-        quote! {
-            #method_sig {
-                unsafe {
-                    let lanes = #movm(a.val);
-                    crate::transmute::checked_transmute_copy(&lanes)
-                }
+        self.kernel_method(op, vec_ty, |_| {
+            quote! {
+                let lanes = #movm(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
             }
-        }
+        })
     }
 
     pub(crate) fn handle_avx512_mask_set(
@@ -1142,8 +1154,8 @@ impl X86 {
 
     pub(crate) fn handle_compare(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream {
         if *self == Self::Avx512 {
-            let method_sig = op.simd_trait_method_sig(vec_ty);
             if vec_ty.scalar == ScalarType::Mask {
+                let method_sig = op.simd_trait_method_sig(vec_ty);
                 let expr = avx512_mask_compare_expr(method, vec_ty);
                 let result = avx512_mask_value(vec_ty, expr);
                 return quote! {
@@ -1153,26 +1165,30 @@ impl X86 {
                 };
             }
 
-            let mask_ty = vec_ty.mask_ty();
-            let result = if vec_ty.scalar == ScalarType::Float {
-                let predicate = avx512_float_compare_predicate(method);
-                let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
-                let intrinsic = intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits());
-                avx512_mask_register_value(
-                    &mask_ty,
-                    quote! { #intrinsic::<#predicate>(a.into(), b.into()) },
-                )
-            } else {
-                let cmp = avx512_compare_op(method);
-                let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
-                let intrinsic = intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits());
-                avx512_mask_register_value(&mask_ty, quote! { #intrinsic(a.into(), b.into()) })
-            };
-            return quote! {
-                #method_sig {
-                    unsafe { #result }
+            return self.kernel_method(op, vec_ty, |token| {
+                let mask_ty = vec_ty.mask_ty();
+                if vec_ty.scalar == ScalarType::Float {
+                    let predicate = avx512_float_compare_predicate(method);
+                    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
+                    let intrinsic =
+                        intrinsic_ident("cmp", &format!("{suffix}_mask"), vec_ty.n_bits());
+                    avx512_mask_register_value_with_simd(
+                        &mask_ty,
+                        quote! { #intrinsic::<#predicate>(a.into(), b.into()) },
+                        quote! { #token },
+                    )
+                } else {
+                    let cmp = avx512_compare_op(method);
+                    let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true);
+                    let intrinsic =
+                        intrinsic_ident(cmp, &format!("{suffix}_mask"), vec_ty.n_bits());
+                    avx512_mask_register_value_with_simd(
+                        &mask_ty,
+                        quote! { #intrinsic(a.into(), b.into()) },
+                        quote! { #token },
+                    )
                 }
-            };
+            });
         }
 
         let args = [quote! { a.into() }, quote! { b.into() }];
@@ -1273,7 +1289,7 @@ impl X86 {
         }
 
         if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Float {
-            let body = match method {
+            match method {
                 "floor" | "ceil" | "round_ties_even" | "trunc" if vec_ty.n_bits() == 512 => {
                     let intrinsic = intrinsic_ident(
                         "roundscale",
@@ -1287,11 +1303,11 @@ impl X86 {
                         "trunc" => quote! { _MM_FROUND_TO_ZERO },
                         _ => unreachable!(),
                     };
-                    quote! {
-                        unsafe {
-                            #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
+                    return self.kernel_method(op, vec_ty, |token| {
+                        quote! {
+                            #intrinsic::<{ #rounding_mode | _MM_FROUND_NO_EXC }>(a.into()).simd_into(#token)
                         }
-                    }
+                    });
                 }
                 "approximate_recip" => {
                     let intrinsic = intrinsic_ident(
@@ -1299,21 +1315,13 @@ impl X86 {
                         op_suffix(vec_ty.scalar, vec_ty.scalar_bits, true),
                         vec_ty.n_bits(),
                     );
-                    quote! {
-                        unsafe {
-                            #intrinsic(a.into()).simd_into(self)
+                    return self.kernel_method(op, vec_ty, |token| {
+                        quote! {
+                            #intrinsic(a.into()).simd_into(#token)
                         }
-                    }
+                    });
                 }
-                _ => TokenStream::new(),
-            };
-
-            if !body.is_empty() {
-                return quote! {
-                    #method_sig {
-                        #body
-                    }
-                };
+                _ => {}
             }
         }
 
@@ -1544,13 +1552,11 @@ impl X86 {
             } else {
                 0b0100
             };
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        #range::<#imm>(a.into(), b.into()).simd_into(self)
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    #range::<#imm>(a.into(), b.into()).simd_into(#token)
                 }
-            };
+            });
         }
 
         match method {
@@ -1559,12 +1565,9 @@ impl X86 {
                     && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)
                     && matches!(vec_ty.scalar_bits, 8 | 16) =>
             {
-                let body = self.handle_avx512_narrow_variable_shift(method, vec_ty);
-                quote! {
-                    #method_sig {
-                        #body
-                    }
-                }
+                self.kernel_method(op, vec_ty, |token| {
+                    self.handle_avx512_narrow_variable_shift(method, vec_ty, token)
+                })
             }
             "shlv" | "shrv"
                 if !(matches!(self, Self::Avx2 | Self::Avx512) && vec_ty.scalar_bits >= 32) =>
@@ -1621,7 +1624,12 @@ impl X86 {
         }
     }
 
-    fn handle_avx512_narrow_variable_shift(&self, method: &str, vec_ty: &VecType) -> TokenStream {
+    fn handle_avx512_narrow_variable_shift(
+        &self,
+        method: &str,
+        vec_ty: &VecType,
+        token: &Ident,
+    ) -> TokenStream {
         assert!(
             *self == Self::Avx512,
             "narrow variable shifts are specialized for AVX-512"
@@ -1640,7 +1648,7 @@ impl X86 {
 
         if vec_ty.scalar_bits == 16 {
             return quote! {
-                unsafe { #shift_intrinsic(a.into(), b.into()).simd_into(self) }
+                #shift_intrinsic(a.into(), b.into()).simd_into(#token)
             };
         }
 
@@ -1664,20 +1672,18 @@ impl X86 {
         };
 
         quote! {
-            unsafe {
-                let val = a.into();
-                let counts = b.into();
-                let zero = #set0();
-                let value_extend = #value_extend;
-                let lo_values = #unpack_lo(val, value_extend);
-                let hi_values = #unpack_hi(val, value_extend);
-                let lo_counts = #unpack_lo(counts, zero);
-                let hi_counts = #unpack_hi(counts, zero);
-                let byte_mask = #set1_epi16(0x00ff);
-                let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask);
-                let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask);
-                #pack(lo_shifted, hi_shifted).simd_into(self)
-            }
+            let val = a.into();
+            let counts = b.into();
+            let zero = #set0();
+            let value_extend = #value_extend;
+            let lo_values = #unpack_lo(val, value_extend);
+            let hi_values = #unpack_hi(val, value_extend);
+            let lo_counts = #unpack_lo(counts, zero);
+            let hi_counts = #unpack_hi(counts, zero);
+            let byte_mask = #set1_epi16(0x00ff);
+            let lo_shifted = #and(#shift_intrinsic(lo_values, lo_counts), byte_mask);
+            let hi_shifted = #and(#shift_intrinsic(hi_values, hi_counts), byte_mask);
+            #pack(lo_shifted, hi_shifted).simd_into(#token)
         }
     }
 
@@ -1821,13 +1827,11 @@ impl X86 {
             }
 
             let blend = avx512_mask_blend_intrinsic(vec_ty);
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        #blend(a.val, c.into(), b.into()).simd_into(self)
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    #blend(a.val, c.into(), b.into()).simd_into(#token)
                 }
-            };
+            });
         }
 
         // Our select ops' argument order is mask, a, b; Intel's intrinsics are b, a, mask
@@ -1873,7 +1877,6 @@ impl X86 {
         }
 
         if *self == Self::Avx512 && half_ty.n_bits() == 256 {
-            let method_sig = op.simd_trait_method_sig(vec_ty);
             let (lo, hi) = match vec_ty.scalar {
                 ScalarType::Float if vec_ty.scalar_bits == 32 => (
                     quote! { _mm512_castps512_ps256(a.into()) },
@@ -1888,16 +1891,14 @@ impl X86 {
                     quote! { _mm512_extracti64x4_epi64::<1>(a.into()) },
                 ),
             };
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        (
-                            #lo.simd_into(self),
-                            #hi.simd_into(self),
-                        )
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    (
+                        #lo.simd_into(#token),
+                        #hi.simd_into(#token),
+                    )
                 }
-            };
+            });
         }
 
         if matches!(self, Self::Avx2 | Self::Avx512) && half_ty.n_bits() == 128 {
@@ -1956,13 +1957,11 @@ impl X86 {
                     _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into())
                 },
             };
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        #expr.simd_into(self)
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    #expr.simd_into(#token)
                 }
-            };
+            });
         }
 
         if matches!(self, Self::Avx2 | Self::Avx512) && combined_ty.n_bits() == 256 {
@@ -1984,7 +1983,6 @@ impl X86 {
 
     pub(crate) fn handle_zip(&self, op: Op, vec_ty: &VecType, select_low: bool) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
-            let method_sig = op.simd_trait_method_sig(vec_ty);
             let offset = if select_low { 0 } else { vec_ty.len / 2 };
             let indices = (0..vec_ty.len).map(|i| {
                 let source_lane = offset + (i / 2);
@@ -1996,13 +1994,11 @@ impl X86 {
             });
             let idx = avx512_index_vector(vec_ty, indices);
             let permute = avx512_permutex2var_intrinsic(vec_ty);
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        #permute(a.into(), #idx, b.into()).simd_into(self)
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    #permute(a.into(), #idx, b.into()).simd_into(#token)
                 }
-            };
+            });
         }
 
         self.kernel_method(op, vec_ty, |token| match vec_ty.n_bits() {
@@ -2047,7 +2043,6 @@ impl X86 {
 
     pub(crate) fn handle_interleave(&self, op: Op, vec_ty: &VecType) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
-            let method_sig = op.simd_trait_method_sig(vec_ty);
             let lo_indices = (0..vec_ty.len).map(|i| {
                 let source_lane = i / 2;
                 if i % 2 == 0 {
@@ -2067,18 +2062,16 @@ impl X86 {
             let lo_idx = avx512_index_vector(vec_ty, lo_indices);
             let hi_idx = avx512_index_vector(vec_ty, hi_indices);
             let permute = avx512_permutex2var_intrinsic(vec_ty);
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        let a = a.into();
-                        let b = b.into();
-                        (
-                            #permute(a, #lo_idx, b).simd_into(self),
-                            #permute(a, #hi_idx, b).simd_into(self),
-                        )
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    let a = a.into();
+                    let b = b.into();
+                    (
+                        #permute(a, #lo_idx, b).simd_into(#token),
+                        #permute(a, #hi_idx, b).simd_into(#token),
+                    )
                 }
-            };
+            });
         }
 
         match vec_ty.n_bits() {
@@ -2125,7 +2118,6 @@ impl X86 {
 
     pub(crate) fn handle_deinterleave(&self, op: Op, vec_ty: &VecType) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
-            let method_sig = op.simd_trait_method_sig(vec_ty);
             let even_indices = (0..vec_ty.len).map(|i| {
                 if i < vec_ty.len / 2 {
                     i * 2
@@ -2143,18 +2135,16 @@ impl X86 {
             let even_idx = avx512_index_vector(vec_ty, even_indices);
             let odd_idx = avx512_index_vector(vec_ty, odd_indices);
             let permute = avx512_permutex2var_intrinsic(vec_ty);
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        let a = a.into();
-                        let b = b.into();
-                        (
-                            #permute(a, #even_idx, b).simd_into(self),
-                            #permute(a, #odd_idx, b).simd_into(self),
-                        )
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    let a = a.into();
+                    let b = b.into();
+                    (
+                        #permute(a, #even_idx, b).simd_into(#token),
+                        #permute(a, #odd_idx, b).simd_into(#token),
+                    )
                 }
-            };
+            });
         }
 
         match vec_ty.n_bits() {
@@ -2244,7 +2234,6 @@ impl X86 {
 
     pub(crate) fn handle_unzip(&self, op: Op, vec_ty: &VecType, select_even: bool) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
-            let method_sig = op.simd_trait_method_sig(vec_ty);
             let lane_offset = if select_even { 0 } else { 1 };
             let indices = (0..vec_ty.len).map(|i| {
                 if i < vec_ty.len / 2 {
@@ -2255,13 +2244,11 @@ impl X86 {
             });
             let idx = avx512_index_vector(vec_ty, indices);
             let permute = avx512_permutex2var_intrinsic(vec_ty);
-            return quote! {
-                #method_sig {
-                    unsafe {
-                        #permute(a.into(), #idx, b.into()).simd_into(self)
-                    }
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    #permute(a.into(), #idx, b.into()).simd_into(#token)
                 }
-            };
+            });
         }
 
         self.kernel_method(op, vec_ty, |token| {
@@ -2390,32 +2377,40 @@ impl X86 {
         }
 
         if *self == Self::Avx512 && granularity == AcrossBlocks && vec_ty.n_bits() >= 256 {
+            let level = self.token();
+            let ty = vec_ty.rust();
+            let vec = quote! { #ty<#level> };
             let byte_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8);
             let base_idx = avx512_index_vector(&byte_ty, 0..byte_ty.len);
             let set_shift = set1_intrinsic(&byte_ty);
             let add = simple_sign_unaware_intrinsic("add", &byte_ty);
             let permute = avx512_permutex2var_intrinsic(&byte_ty);
             let byte_shift = if scalar_bytes == 1 {
-                quote! { SHIFT }
+                quote! { shift }
             } else {
-                quote! { SHIFT * #scalar_bytes }
+                quote! { shift * #scalar_bytes }
             };
 
             return quote! {
                 #method_sig {
-                    unsafe {
-                        if SHIFT >= #max_shift {
-                            return b;
+                    crate::kernel!(
+                        #[inline(always)]
+                        fn kernel(token: #level, a: #vec, b: #vec, shift: usize) -> #vec {
+                            if shift >= #max_shift {
+                                return b;
+                            }
+
+                            let idx = #add(#base_idx, #set_shift((#byte_shift) as i8));
+                            let result = #permute(
+                                token.#to_bytes(a).val.0,
+                                idx,
+                                token.#to_bytes(b).val.0,
+                            );
+                            token.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: token })
                         }
+                    );
 
-                        let idx = #add(#base_idx, #set_shift((#byte_shift) as i8));
-                        let result = #permute(
-                            self.#to_bytes(a).val.0,
-                            idx,
-                            self.#to_bytes(b).val.0,
-                        );
-                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
-                    }
+                    kernel(self, a, b, SHIFT)
                 }
             };
         }
@@ -2471,7 +2466,6 @@ impl X86 {
             vec_ty.scalar_bits, target_scalar_bits,
             "we currently only support converting between types of the same width"
         );
-        let method_sig = op.simd_trait_method_sig(vec_ty);
 
         if *self == Self::Avx512
             && vec_ty.scalar == ScalarType::Float
@@ -2479,16 +2473,17 @@ impl X86 {
         {
             let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
             let convert = intrinsic_ident("cvttps", "epu32", vec_ty.n_bits());
-            let expr = if precise {
-                let max = simple_intrinsic("max", vec_ty);
-                let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
-                let blend = avx512_mask_blend_intrinsic(&target_ty);
-                let set1_float = set1_intrinsic(vec_ty);
-                let set1_int = set1_intrinsic(&target_ty);
-                let set0_float = intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits());
-                let lt = avx512_float_compare_predicate("simd_lt");
-                quote! {
-                    unsafe {
+            return self.kernel_method(op, vec_ty, |token| {
+                if precise {
+                    let max = simple_intrinsic("max", vec_ty);
+                    let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                    let blend = avx512_mask_blend_intrinsic(&target_ty);
+                    let set1_float = set1_intrinsic(vec_ty);
+                    let set1_int = set1_intrinsic(&target_ty);
+                    let set0_float =
+                        intrinsic_ident("setzero", coarse_type(vec_ty), vec_ty.n_bits());
+                    let lt = avx512_float_compare_predicate("simd_lt");
+                    quote! {
                         let a = #max(a.into(), #set0_float());
                         let mut converted = #convert(a);
                         let exceeds_unsigned_range = #cmp::<#lt>(#set1_float(4294967040.0), a);
@@ -2497,27 +2492,19 @@ impl X86 {
                             converted,
                             #set1_int(u32::MAX.cast_signed()),
                         );
-                        converted.simd_into(self)
+                        converted.simd_into(#token)
                     }
-                }
-            } else {
-                quote! {
-                    unsafe {
-                        #convert(a.into()).simd_into(self)
+                } else {
+                    quote! {
+                        #convert(a.into()).simd_into(#token)
                     }
                 }
-            };
-
-            return quote! {
-                #method_sig {
-                    #expr
-                }
-            };
+            });
         }
 
         if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
             let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
-            let expr = match (vec_ty.scalar, target_scalar) {
+            return self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) {
                 (ScalarType::Float, ScalarType::Int) => {
                     let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits());
                     if precise {
@@ -2530,48 +2517,34 @@ impl X86 {
                         let lt = avx512_float_compare_predicate("simd_lt");
                         let ord = avx512_float_compare_predicate("ord");
                         quote! {
-                            unsafe {
-                                let a = a.into();
-                                let mut converted = #convert(a);
-                                let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0));
-                                converted = #blend(in_range, #set1_int(i32::MAX), converted);
-                                let is_not_nan = #cmp::<#ord>(a, a);
-                                converted = #blend(is_not_nan, #set0_int(), converted);
-                                converted.simd_into(self)
-                            }
+                            let a = a.into();
+                            let mut converted = #convert(a);
+                            let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0));
+                            converted = #blend(in_range, #set1_int(i32::MAX), converted);
+                            let is_not_nan = #cmp::<#ord>(a, a);
+                            converted = #blend(is_not_nan, #set0_int(), converted);
+                            converted.simd_into(#token)
                         }
                     } else {
                         quote! {
-                            unsafe {
-                                #convert(a.into()).simd_into(self)
-                            }
+                            #convert(a.into()).simd_into(#token)
                         }
                     }
                 }
                 (ScalarType::Int, ScalarType::Float) => {
                     let intrinsic = simple_intrinsic("cvtepi32", &target_ty);
                     quote! {
-                        unsafe {
-                            #intrinsic(a.into()).simd_into(self)
-                        }
+                        #intrinsic(a.into()).simd_into(#token)
                     }
                 }
                 (ScalarType::Unsigned, ScalarType::Float) => {
                     let intrinsic = simple_intrinsic("cvtepu32", &target_ty);
                     quote! {
-                        unsafe {
-                            #intrinsic(a.into()).simd_into(self)
-                        }
+                        #intrinsic(a.into()).simd_into(#token)
                     }
                 }
                 _ => unimplemented!(),
-            };
-
-            return quote! {
-                #method_sig {
-                    #expr
-                }
-            };
+            });
         }
 
         self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) {

From 70e489bdec16c2e4b879871f35af141410072826 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 21 Jun 2026 01:46:57 +0100
Subject: [PATCH 44/55] Optimize u32->f32 conversion for 128-bit and 256-bit
 vectors on AVX-512

---
 fearless_simd/src/generated/avx512.rs         | 26 +++----------------
 fearless_simd_gen/src/mk_x86.rs               | 21 +++++++++++++++
 .../tests/harness/lm_generated/mod_256.rs     | 16 ++++++++++++
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 2f73c5fc5..987387f3c 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -3210,16 +3210,8 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u32x4<Avx512>) -> f32x4<Avx512> {
-                let a = a.into();
-                let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
-                let hi =
-                    _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
-                let fhi = _mm_sub_ps(
-                    _mm_castsi128_ps(hi),
-                    _mm_set1_ps(f32::from_bits(0x53000080)),
-                );
-                let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
-                result.simd_into(token)
+                _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_zextsi128_si512(a.into())))
+                    .simd_into(token)
             }
         );
         kernel(self, a)
@@ -7858,18 +7850,8 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u32x8<Avx512>) -> f32x8<Avx512> {
-                let a = a.into();
-                let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000));
-                let hi = _mm256_blend_epi16::<0xAA>(
-                    _mm256_srli_epi32::<16>(a),
-                    _mm256_set1_epi32(0x53000000),
-                );
-                let fhi = _mm256_sub_ps(
-                    _mm256_castsi256_ps(hi),
-                    _mm256_set1_ps(f32::from_bits(0x53000080)),
-                );
-                let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi);
-                result.simd_into(token)
+                _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_zextsi256_si512(a.into())))
+                    .simd_into(token)
             }
         );
         kernel(self, a)
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 411dc4566..9b6cca790 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -2502,6 +2502,27 @@ impl X86 {
             });
         }
 
+        if *self == Self::Avx512
+            && matches!(vec_ty.n_bits(), 128 | 256)
+            && vec_ty.scalar == ScalarType::Unsigned
+            && target_scalar == ScalarType::Float
+            && vec_ty.scalar_bits == 32
+        {
+            // We cannot call the 128-bit and 256-bit conversion intrinsics directly
+            // because they are mysteriously absent from stdarch:
+            // https://github.com/rust-lang/rust/issues/158196
+            // Fortunately LLVM folds the widen/convert/narrow sequence into that single instruction.
+            let bits = vec_ty.n_bits();
+            let zext = format_ident!("_mm512_zextsi{bits}_si512");
+            let convert = intrinsic_ident("cvtepu32", "ps", 512);
+            let cast = format_ident!("_mm512_castps512_ps{bits}");
+            return self.kernel_method(op, vec_ty, |token| {
+                quote! {
+                    #cast(#convert(#zext(a.into()))).simd_into(#token)
+                }
+            });
+        }
+
         if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
             let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
             return self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) {
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
index e82ac078e..459c0bd2b 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
@@ -404,6 +404,22 @@ fn cvt_u32_f32x8_rounding<S: Simd>(simd: S) {
     assert_eq!(*a.to_int::<u32x8<_>>(), [0, 0, 0, 0, 1, 1, 2, 3]);
 }
 
+#[simd_test]
+fn cvt_f32_u32x8<S: Simd>(simd: S) {
+    let values = [
+        0,
+        42,
+        1_000_000,
+        i32::MAX as u32,
+        0x8000_0000,
+        0xffff_ff00,
+        u32::MAX - 1,
+        u32::MAX,
+    ];
+    let a = u32x8::from_slice(simd, &values);
+    assert_eq!(*a.to_float::<f32x8<_>>(), values.map(|x| x as f32));
+}
+
 #[simd_test]
 fn cvt_u32_precise_f32x8_inf<S: Simd>(simd: S) {
     let a = f32x8::from_slice(

From a5f1b3ae1de8195c6e3545148fca7a8fe415d368 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 21 Jun 2026 02:09:15 +0100
Subject: [PATCH 45/55] Optimize precise f32 to i32 conversions on AVX-512 for
 vector sizes less than 512

---
 fearless_simd/src/generated/avx512.rs         | 38 ++++--------
 fearless_simd_gen/src/mk_x86.rs               | 60 +++++++++++--------
 .../tests/harness/lm_generated/mod_256.rs     | 21 +++++++
 3 files changed, 66 insertions(+), 53 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 987387f3c..6d52b5ca9 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -614,18 +614,10 @@ impl Simd for Avx512 {
             #[inline(always)]
             fn kernel(token: Avx512, a: f32x4<Avx512>) -> i32x4<Avx512> {
                 let a = a.into();
-                let mut converted = _mm_cvttps_epi32(a);
-                let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
-                let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
-                if !all_in_range {
-                    converted = _mm_blendv_epi8(
-                        _mm_set1_epi32(i32::MAX),
-                        converted,
-                        _mm_castps_si128(in_range),
-                    );
-                    let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a));
-                    converted = _mm_and_si128(converted, is_not_nan);
-                }
+                let in_range = _mm_cmp_ps_mask::<17i32>(a, _mm_set1_ps(2147483648.0));
+                let mut converted = _mm_mask_cvttps_epi32(_mm_set1_epi32(i32::MAX), in_range, a);
+                let is_not_nan = _mm_cmp_ps_mask::<7i32>(a, a);
+                converted = _mm_mask_blend_epi32(is_not_nan, _mm_setzero_si128(), converted);
                 converted.simd_into(token)
             }
         );
@@ -4532,18 +4524,11 @@ impl Simd for Avx512 {
             #[inline(always)]
             fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
                 let a = a.into();
-                let mut converted = _mm256_cvttps_epi32(a);
-                let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
-                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
-                if !all_in_range {
-                    converted = _mm256_blendv_epi8(
-                        _mm256_set1_epi32(i32::MAX),
-                        converted,
-                        _mm256_castps_si256(in_range),
-                    );
-                    let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a));
-                    converted = _mm256_and_si256(converted, is_not_nan);
-                }
+                let in_range = _mm256_cmp_ps_mask::<17i32>(a, _mm256_set1_ps(2147483648.0));
+                let mut converted =
+                    _mm256_mask_cvttps_epi32(_mm256_set1_epi32(i32::MAX), in_range, a);
+                let is_not_nan = _mm256_cmp_ps_mask::<7i32>(a, a);
+                converted = _mm256_mask_blend_epi32(is_not_nan, _mm256_setzero_si256(), converted);
                 converted.simd_into(token)
             }
         );
@@ -9323,10 +9308,9 @@ impl Simd for Avx512 {
             #[inline(always)]
             fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
                 let a = a.into();
-                let mut converted = _mm512_cvttps_epi32(a);
                 let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0));
-                converted =
-                    _mm512_mask_blend_epi32(in_range, _mm512_set1_epi32(i32::MAX), converted);
+                let mut converted =
+                    _mm512_mask_cvttps_epi32(_mm512_set1_epi32(i32::MAX), in_range, a);
                 let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a);
                 converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted);
                 converted.simd_into(token)
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 9b6cca790..64841fea1 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -2502,6 +2502,40 @@ impl X86 {
             });
         }
 
+        if *self == Self::Avx512
+            && vec_ty.scalar == ScalarType::Float
+            && target_scalar == ScalarType::Int
+            && vec_ty.scalar_bits == 32
+        {
+            let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
+            let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits());
+            return self.kernel_method(op, vec_ty, |token| {
+                if precise {
+                    let masked_convert = intrinsic_ident("mask_cvttps", "epi32", vec_ty.n_bits());
+                    let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
+                    let blend = avx512_mask_blend_intrinsic(&target_ty);
+                    let set1_float = set1_intrinsic(vec_ty);
+                    let set1_int = set1_intrinsic(&target_ty);
+                    let set0_int =
+                        intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits());
+                    let lt = avx512_float_compare_predicate("simd_lt");
+                    let ord = avx512_float_compare_predicate("ord");
+                    quote! {
+                        let a = a.into();
+                        let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0));
+                        let mut converted = #masked_convert(#set1_int(i32::MAX), in_range, a);
+                        let is_not_nan = #cmp::<#ord>(a, a);
+                        converted = #blend(is_not_nan, #set0_int(), converted);
+                        converted.simd_into(#token)
+                    }
+                } else {
+                    quote! {
+                        #convert(a.into()).simd_into(#token)
+                    }
+                }
+            });
+        }
+
         if *self == Self::Avx512
             && matches!(vec_ty.n_bits(), 128 | 256)
             && vec_ty.scalar == ScalarType::Unsigned
@@ -2526,32 +2560,6 @@ impl X86 {
         if *self == Self::Avx512 && vec_ty.n_bits() == 512 {
             let target_ty = vec_ty.reinterpret(target_scalar, target_scalar_bits);
             return self.kernel_method(op, vec_ty, |token| match (vec_ty.scalar, target_scalar) {
-                (ScalarType::Float, ScalarType::Int) => {
-                    let convert = intrinsic_ident("cvttps", "epi32", vec_ty.n_bits());
-                    if precise {
-                        let cmp = intrinsic_ident("cmp", "ps_mask", vec_ty.n_bits());
-                        let blend = avx512_mask_blend_intrinsic(&target_ty);
-                        let set1_float = set1_intrinsic(vec_ty);
-                        let set1_int = set1_intrinsic(&target_ty);
-                        let set0_int =
-                            intrinsic_ident("setzero", coarse_type(&target_ty), target_ty.n_bits());
-                        let lt = avx512_float_compare_predicate("simd_lt");
-                        let ord = avx512_float_compare_predicate("ord");
-                        quote! {
-                            let a = a.into();
-                            let mut converted = #convert(a);
-                            let in_range = #cmp::<#lt>(a, #set1_float(2147483648.0));
-                            converted = #blend(in_range, #set1_int(i32::MAX), converted);
-                            let is_not_nan = #cmp::<#ord>(a, a);
-                            converted = #blend(is_not_nan, #set0_int(), converted);
-                            converted.simd_into(#token)
-                        }
-                    } else {
-                        quote! {
-                            #convert(a.into()).simd_into(#token)
-                        }
-                    }
-                }
                 (ScalarType::Int, ScalarType::Float) => {
                     let intrinsic = simple_intrinsic("cvtepi32", &target_ty);
                     quote! {
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
index 459c0bd2b..f9736013c 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mod_256.rs
@@ -398,6 +398,27 @@ fn cvt_u32_precise_f32x8<S: Simd>(simd: S) {
     );
 }
 
+#[simd_test]
+fn cvt_i32_precise_f32x8<S: Simd>(simd: S) {
+    let a = f32x8::from_slice(
+        simd,
+        &[
+            -10.3,
+            f32::NAN,
+            5e9,
+            -5e9,
+            f32::INFINITY,
+            f32::NEG_INFINITY,
+            42.7,
+            -0.9,
+        ],
+    );
+    assert_eq!(
+        *a.to_int_precise::<i32x8<_>>(),
+        [-10, 0, i32::MAX, i32::MIN, i32::MAX, i32::MIN, 42, 0]
+    );
+}
+
 #[simd_test]
 fn cvt_u32_f32x8_rounding<S: Simd>(simd: S) {
     let a = f32x8::from_slice(simd, &[0.0, 0.49, 0.51, 0.99, 1.01, 1.99, 2.5, 3.75]);

From 490f83bb893fe8b2ea45a8f9096869289c54f267 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 21 Jun 2026 02:22:56 +0100
Subject: [PATCH 46/55] Optimize 128-bit unzip and deinterleave on AVX-512

---
 fearless_simd/src/generated/avx512.rs | 226 ++++++++++++++++++++------
 fearless_simd_gen/src/mk_x86.rs       |  11 +-
 2 files changed, 185 insertions(+), 52 deletions(-)

diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 6d52b5ca9..976ebd2ad 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -936,10 +936,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> i8x16<Avx512> {
-                let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpacklo_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi8(
+                    a.into(),
+                    _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -949,10 +951,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: i8x16<Avx512>, b: i8x16<Avx512>) -> i8x16<Avx512> {
-                let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpackhi_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi8(
+                    a.into(),
+                    _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -963,7 +967,32 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn deinterleave_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> (i8x16<Self>, i8x16<Self>) {
-        (self.unzip_low_i8x16(a, b), self.unzip_high_i8x16(a, b))
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x16<Avx512>,
+                b: i8x16<Avx512>,
+            ) -> (i8x16<Avx512>, i8x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi8(
+                        a,
+                        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm_permutex2var_epi8(
+                        a,
+                        _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
@@ -1353,10 +1382,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> u8x16<Avx512> {
-                let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpacklo_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi8(
+                    a.into(),
+                    _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -1366,10 +1397,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u8x16<Avx512>, b: u8x16<Avx512>) -> u8x16<Avx512> {
-                let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpackhi_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi8(
+                    a.into(),
+                    _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -1380,7 +1413,32 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn deinterleave_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> (u8x16<Self>, u8x16<Self>) {
-        (self.unzip_low_u8x16(a, b), self.unzip_high_u8x16(a, b))
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x16<Avx512>,
+                b: u8x16<Avx512>,
+            ) -> (u8x16<Avx512>, u8x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi8(
+                        a,
+                        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm_permutex2var_epi8(
+                        a,
+                        _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
@@ -1854,10 +1912,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> i16x8<Avx512> {
-                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpacklo_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi16(
+                    a.into(),
+                    _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -1867,10 +1927,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: i16x8<Avx512>, b: i16x8<Avx512>) -> i16x8<Avx512> {
-                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpackhi_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi16(
+                    a.into(),
+                    _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -1881,7 +1943,24 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn deinterleave_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> (i16x8<Self>, i16x8<Self>) {
-        (self.unzip_low_i16x8(a, b), self.unzip_high_i16x8(a, b))
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x8<Avx512>,
+                b: i16x8<Avx512>,
+            ) -> (i16x8<Avx512>, i16x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi16(a, _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm_permutex2var_epi16(a, _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
@@ -2230,10 +2309,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> u16x8<Avx512> {
-                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpacklo_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi16(
+                    a.into(),
+                    _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -2243,10 +2324,12 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u16x8<Avx512>, b: u16x8<Avx512>) -> u16x8<Avx512> {
-                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-                let t1 = _mm_shuffle_epi8(a.into(), mask);
-                let t2 = _mm_shuffle_epi8(b.into(), mask);
-                _mm_unpackhi_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi16(
+                    a.into(),
+                    _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -2257,7 +2340,24 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn deinterleave_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> (u16x8<Self>, u16x8<Self>) {
-        (self.unzip_low_u16x8(a, b), self.unzip_high_u16x8(a, b))
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x8<Avx512>,
+                b: u16x8<Avx512>,
+            ) -> (u16x8<Avx512>, u16x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi16(a, _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm_permutex2var_epi16(a, _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
@@ -2731,9 +2831,8 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> i32x4<Avx512> {
-                let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
-                let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
-                _mm_unpacklo_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(0, 2, 4, 6), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -2743,9 +2842,8 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: i32x4<Avx512>, b: i32x4<Avx512>) -> i32x4<Avx512> {
-                let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
-                let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
-                _mm_unpackhi_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(1, 3, 5, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -2756,7 +2854,22 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn deinterleave_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> (i32x4<Self>, i32x4<Self>) {
-        (self.unzip_low_i32x4(a, b), self.unzip_high_i32x4(a, b))
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x4<Avx512>,
+                b: i32x4<Avx512>,
+            ) -> (i32x4<Avx512>, i32x4<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi32(a, _mm_setr_epi32(0, 2, 4, 6), b).simd_into(token),
+                    _mm_permutex2var_epi32(a, _mm_setr_epi32(1, 3, 5, 7), b).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
@@ -3115,9 +3228,8 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> u32x4<Avx512> {
-                let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
-                let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
-                _mm_unpacklo_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(0, 2, 4, 6), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -3127,9 +3239,8 @@ impl Simd for Avx512 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx512, a: u32x4<Avx512>, b: u32x4<Avx512>) -> u32x4<Avx512> {
-                let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
-                let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
-                _mm_unpackhi_epi64(t1, t2).simd_into(token)
+                _mm_permutex2var_epi32(a.into(), _mm_setr_epi32(1, 3, 5, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
@@ -3140,7 +3251,22 @@ impl Simd for Avx512 {
     }
     #[inline(always)]
     fn deinterleave_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> (u32x4<Self>, u32x4<Self>) {
-        (self.unzip_low_u32x4(a, b), self.unzip_high_u32x4(a, b))
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u32x4<Avx512>,
+                b: u32x4<Avx512>,
+            ) -> (u32x4<Avx512>, u32x4<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi32(a, _mm_setr_epi32(0, 2, 4, 6), b).simd_into(token),
+                    _mm_permutex2var_epi32(a, _mm_setr_epi32(1, 3, 5, 7), b).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
     fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 64841fea1..08ee3ac51 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -813,6 +813,13 @@ fn avx512_permutex2var_intrinsic(vec_ty: &VecType) -> Ident {
     intrinsic_ident("permutex2var", suffix, vec_ty.n_bits())
 }
 
+fn avx512_should_use_unzip_permutex2var(vec_ty: &VecType) -> bool {
+    vec_ty.scalar != ScalarType::Mask
+        && (vec_ty.n_bits() >= 256
+            || (vec_ty.n_bits() == 128
+                && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)))
+}
+
 fn avx512_permutexvar_intrinsic(vec_ty: &VecType) -> Ident {
     let suffix = op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false);
     intrinsic_ident("permutexvar", suffix, vec_ty.n_bits())
@@ -2117,7 +2124,7 @@ impl X86 {
     }
 
     pub(crate) fn handle_deinterleave(&self, op: Op, vec_ty: &VecType) -> TokenStream {
-        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+        if *self == Self::Avx512 && avx512_should_use_unzip_permutex2var(vec_ty) {
             let even_indices = (0..vec_ty.len).map(|i| {
                 if i < vec_ty.len / 2 {
                     i * 2
@@ -2233,7 +2240,7 @@ impl X86 {
     }
 
     pub(crate) fn handle_unzip(&self, op: Op, vec_ty: &VecType, select_even: bool) -> TokenStream {
-        if *self == Self::Avx512 && vec_ty.scalar != ScalarType::Mask && vec_ty.n_bits() >= 256 {
+        if *self == Self::Avx512 && avx512_should_use_unzip_permutex2var(vec_ty) {
             let lane_offset = if select_even { 0 } else { 1 };
             let indices = (0..vec_ty.len).map(|i| {
                 if i < vec_ty.len / 2 {

From 6d5f4ed7793492a5b0c0b7071229d1d6d5b67a7c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 11:42:49 +0100
Subject: [PATCH 47/55] Document AVX-512 support in the README

---
 fearless_simd/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs
index de39e0cee..68ca3d0e4 100644
--- a/fearless_simd/src/lib.rs
+++ b/fearless_simd/src/lib.rs
@@ -114,7 +114,7 @@
 //!
 //! # Instruction set support
 //!
-//! - x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2)
+//! - x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2), [Ice Lake](https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512) (AVX-512, avoiding early slow implementations)
 //! - Aarch64: Baseline [NEON](https://en.wikipedia.org/wiki/Arm_architecture_family#Advanced_SIMD_(Neon))
 //! - WebAssembly: [128-bit packed SIMD](https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md), [relaxed SIMD](https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/Overview.md)
 //!

From cf18ec3159fd5c02479c02f89759593277b1090c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 12:56:15 +0100
Subject: [PATCH 48/55] Regenerate README

---
 fearless_simd/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fearless_simd/README.md b/fearless_simd/README.md
index ee8c5b8a1..ffba6b0fd 100644
--- a/fearless_simd/README.md
+++ b/fearless_simd/README.md
@@ -146,7 +146,7 @@ case. There's also Q&A on [Zulip](https://xi.zulipchat.com/#narrow/channel/51423
 
 ## Instruction set support
 
-- x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2)
+- x86/x86-64: [v2](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (SSE4.2), [v3](https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels) (AVX2), [Ice Lake](https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512) (AVX-512, avoiding early slow implementations)
 - Aarch64: Baseline [NEON](https://en.wikipedia.org/wiki/Arm_architecture_family#Advanced_SIMD_(Neon))
 - WebAssembly: [128-bit packed SIMD](https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md), [relaxed SIMD](https://github.com/WebAssembly/relaxed-simd/blob/main/proposals/relaxed-simd/Overview.md)
 

From 8dc0938e3c1e79236ae1c196cd5534c4f57969d2 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 14:01:20 +0100
Subject: [PATCH 49/55] Add 64-bit integer vectors and operations on them

---
 fearless_simd/src/generated/avx2.rs           | 10886 +++++++++------
 fearless_simd/src/generated/avx512.rs         | 11459 ++++++++++------
 fearless_simd/src/generated/fallback.rs       |  1632 ++-
 fearless_simd/src/generated/neon.rs           |  9442 ++++++++-----
 fearless_simd/src/generated/ops.rs            |  5778 +++++---
 fearless_simd/src/generated/simd_trait.rs     |   542 +-
 fearless_simd/src/generated/simd_types.rs     |  1699 ++-
 fearless_simd/src/generated/sse4_2.rs         | 10032 ++++++++------
 fearless_simd/src/generated/wasm.rs           |  9328 ++++++++-----
 fearless_simd/src/traits.rs                   |     5 +
 fearless_simd/src/transmute.rs                |    10 +-
 fearless_simd_gen/src/arch/neon.rs            |    15 +-
 fearless_simd_gen/src/arch/x86.rs             |     7 +-
 fearless_simd_gen/src/generic.rs              |    83 +-
 fearless_simd_gen/src/level.rs                |     2 +
 fearless_simd_gen/src/mk_fallback.rs          |    16 +-
 fearless_simd_gen/src/mk_neon.rs              |    14 +-
 fearless_simd_gen/src/mk_simd_trait.rs        |     9 +-
 fearless_simd_gen/src/mk_simd_types.rs        |     7 +-
 fearless_simd_gen/src/mk_wasm.rs              |    68 +-
 fearless_simd_gen/src/mk_x86.rs               |   191 +-
 fearless_simd_gen/src/types.rs                |     6 +
 fearless_simd_tests/tests/harness/int64.rs    |  1464 ++
 fearless_simd_tests/tests/harness/mod.rs      |     1 +
 .../tests/harness/slide_exhaustive.rs         |     6 +
 25 files changed, 40714 insertions(+), 21988 deletions(-)
 create mode 100644 fearless_simd_tests/tests/harness/int64.rs

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 3f4ee93e7..e9db0d6c3 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -6,9 +6,9 @@
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
@@ -45,6 +45,8 @@ impl ArchTypes for Avx2 {
     type u32x4 = crate::support::Aligned128<__m128i>;
     type mask32x4 = crate::support::Aligned128<__m128i>;
     type f64x2 = crate::support::Aligned128<__m128d>;
+    type i64x2 = crate::support::Aligned128<__m128i>;
+    type u64x2 = crate::support::Aligned128<__m128i>;
     type mask64x2 = crate::support::Aligned128<__m128i>;
     type f32x8 = crate::support::Aligned256<__m256>;
     type i8x32 = crate::support::Aligned256<__m256i>;
@@ -57,6 +59,8 @@ impl ArchTypes for Avx2 {
     type u32x8 = crate::support::Aligned256<__m256i>;
     type mask32x8 = crate::support::Aligned256<__m256i>;
     type f64x4 = crate::support::Aligned256<__m256d>;
+    type i64x4 = crate::support::Aligned256<__m256i>;
+    type u64x4 = crate::support::Aligned256<__m256i>;
     type mask64x4 = crate::support::Aligned256<__m256i>;
     type f32x16 = crate::support::Aligned512<[__m256; 2usize]>;
     type i8x64 = crate::support::Aligned512<[__m256i; 2usize]>;
@@ -69,6 +73,8 @@ impl ArchTypes for Avx2 {
     type u32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
     type mask32x16 = crate::support::Aligned512<[__m256i; 2usize]>;
     type f64x8 = crate::support::Aligned512<[__m256d; 2usize]>;
+    type i64x8 = crate::support::Aligned512<[__m256i; 2usize]>;
+    type u64x8 = crate::support::Aligned512<[__m256i; 2usize]>;
     type mask64x8 = crate::support::Aligned512<[__m256i; 2usize]>;
 }
 impl Simd for Avx2 {
@@ -80,6 +86,8 @@ impl Simd for Avx2 {
     type i16s = i16x16<Self>;
     type u32s = u32x8<Self>;
     type i32s = i32x8<Self>;
+    type u64s = u64x4<Self>;
+    type i64s = i64x4<Self>;
     type mask8s = mask8x32<Self>;
     type mask16s = mask16x16<Self>;
     type mask32s = mask32x8<Self>;
@@ -785,7 +793,27 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i8; 16usize] = a.into();
+        let b: [i8; 16usize] = b.into();
+        let result: [i8; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
@@ -805,7 +833,27 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i8; 16usize] = a.into();
+        let b: [i8; 16usize] = b.into();
+        let result: [i8; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
@@ -1153,7 +1201,27 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u8; 16usize] = a.into();
+        let b: [u8; 16usize] = b.into();
+        let result: [u8; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
@@ -1173,7 +1241,27 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u8; 16usize] = a.into();
+        let b: [u8; 16usize] = b.into();
+        let result: [u8; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
@@ -1681,7 +1769,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i16; 8usize] = a.into();
+        let b: [i16; 8usize] = b.into();
+        let result: [i16; 8usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
@@ -1695,7 +1795,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i16; 8usize] = a.into();
+        let b: [i16; 8usize] = b.into();
+        let result: [i16; 8usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
@@ -2030,7 +2142,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u16; 8usize] = a.into();
+        let b: [u16; 8usize] = b.into();
+        let result: [u16; 8usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
@@ -2044,7 +2168,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u16; 8usize] = a.into();
+        let b: [u16; 8usize] = b.into();
+        let result: [u16; 8usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
@@ -3711,1333 +3847,1692 @@ impl Simd for Avx2 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+    fn splat_i64x2(self, val: i64) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: bool) -> mask64x2<Avx2> {
-                let val: i64 = if val { !0 } else { 0 };
+            fn kernel(token: Avx2, val: i64) -> i64x2<Avx2> {
                 _mm_set1_epi64x(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
-        mask64x2 {
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize] {
         crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128i, [i64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128i, [i64; 2usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i64x2(b).val.0,
+            self.cvt_to_bytes_i64x2(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self> {
+        self.slide_i64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask64x2<Avx2> {
-                {
-                    let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
-                    let bit_mask = _mm_set_epi64x(2, 1);
-                    _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
-                }
-                .simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, bits)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> u64 {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 2usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2usize
-        );
-        let mut lanes = self.as_array_mask64x2(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x2(lanes);
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [
+            a[0usize].wrapping_mul(b[0usize]),
+            a[1usize].wrapping_mul(b[1usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
                 _mm_and_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
                 _mm_or_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
                 _mm_xor_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
-        self.xor_mask64x2(a, self.splat_mask64x2(true))
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
+        a ^ !0
     }
     #[inline(always)]
-    fn select_mask64x2(
-        self,
-        a: mask64x2<Self>,
-        b: mask64x2<Self>,
-        c: mask64x2<Self>,
-    ) -> mask64x2<Self> {
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask64x2<Avx2>,
-                b: mask64x2<Avx2>,
-                c: mask64x2<Avx2>,
-            ) -> mask64x2<Avx2> {
-                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>, shift: u32) -> i64x2<Avx2> {
+                _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
-                _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let result: [i64; 2usize] = [
+            core::ops::Shr::shr(a[0usize], shift),
+            core::ops::Shr::shr(a[1usize], shift),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> mask64x2<Avx2> {
+                _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] < b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] < b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] <= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] <= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] >= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] >= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x4<Avx2> {
-                _mm256_setr_m128i(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] > b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] > b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: f32) -> f32x8<Avx2> {
-                _mm256_set1_ps(val).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0)
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
-        crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0)
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
-        crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0)
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b))
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
-        f32x8 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b))
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask64x2<Avx2>,
+                b: i64x2<Avx2>,
+                c: i64x2<Avx2>,
+            ) -> i64x2<Avx2> {
+                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        if SHIFT >= 8usize {
-            return b;
-        }
-        let result = cross_block_alignr_256x1(
-            self,
-            self.cvt_to_bytes_f32x8(b).val.0,
-            self.cvt_to_bytes_f32x8(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_f32x8(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x8<const SHIFT: usize>(
-        self,
-        a: f32x8<Self>,
-        b: f32x8<Self>,
-    ) -> f32x8<Self> {
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let result = dyn_alignr_256(
-            self,
-            self.cvt_to_bytes_f32x8(b).val.0,
-            self.cvt_to_bytes_f32x8(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_f32x8(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>, b: i64x2<Avx2>) -> i64x4<Avx2> {
+                _mm256_setr_m128i(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>) -> i64x2<Avx2> {
+                _mm_sub_epi64(_mm_setzero_si128(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_sqrt_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>) -> u8x16<Avx2> {
+                __m128i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_rcp_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i64x2<Avx2>) -> u32x4<Avx2> {
+                __m128i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn splat_u64x2(self, val: u64) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_add_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, val: u64) -> u64x2<Avx2> {
+                _mm_set1_epi64x(val.cast_signed()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, val)
     }
     #[inline(always)]
-    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_sub_ps(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_mul_ps(a.into(), b.into()).simd_into(token)
-            }
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128i, [u64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128i, [u64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128i, [u64; 2usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u64x2(b).val.0,
+            self.cvt_to_bytes_u64x2(a).val.0,
+            SHIFT * 8usize,
         );
-        kernel(self, a, b)
+        self.cvt_from_bytes_u64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
+        self,
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self> {
+        self.slide_u64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_div_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let mask = _mm256_set1_ps(-0.0);
-                _mm256_or_ps(
-                    _mm256_and_ps(mask, b.into()),
-                    _mm256_andnot_ps(mask, a.into()),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [
+            a[0usize].wrapping_mul(b[0usize]),
+            a[1usize].wrapping_mul(b[1usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_and_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_or_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_xor_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, shift: u32) -> u64x2<Avx2> {
+                _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let lo = _mm256_unpacklo_ps(a.into(), b.into());
-                let hi = _mm256_unpackhi_ps(a.into(), b.into());
-                _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, shift: u32) -> u64x2<Avx2> {
+                _mm_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let lo = _mm256_unpacklo_ps(a.into(), b.into());
-                let hi = _mm256_unpackhi_ps(a.into(), b.into());
-                _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_srlv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let t1 =
-                    _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
-                let t2 =
-                    _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
-                _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> mask64x2<Avx2> {
+                _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let t1 =
-                    _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
-                let t2 =
-                    _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
-                _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] < b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] < b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> (f32x8<Avx2>, f32x8<Avx2>) {
-                let lo = _mm256_unpacklo_ps(a.into(), b.into());
-                let hi = _mm256_unpackhi_ps(a.into(), b.into());
-                (
-                    _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token),
-                    _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] <= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] <= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> (f32x8<Avx2>, f32x8<Avx2>) {
-                let t1 =
-                    _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
-                let t2 =
-                    _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
-                (
-                    _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token),
-                    _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] >= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] >= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_max_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> mask64x2<Avx2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] > b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] > b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_min_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let intermediate = _mm256_max_ps(a.into(), b.into());
-                let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into());
-                _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
-                let intermediate = _mm256_min_ps(a.into(), b.into());
-                let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into());
-                _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>, c: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x2<Avx2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>, c: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: mask64x2<Avx2>,
+                b: u64x2<Avx2>,
+                c: u64x2<Avx2>,
+            ) -> u64x2<Avx2> {
+                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>, b: u64x2<Avx2>) -> u64x4<Avx2> {
+                _mm256_setr_m128i(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>) -> u8x16<Avx2> {
+                __m128i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        a - self.trunc_f32x8(a)
-    }
-    #[inline(always)]
-    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: u64x2<Avx2>) -> u32x4<Avx2> {
+                __m128i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask32x8<Avx2>,
-                b: f32x8<Avx2>,
-                c: f32x8<Avx2>,
-            ) -> f32x8<Avx2> {
-                _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(token)
+            fn kernel(token: Avx2, val: bool) -> mask64x2<Avx2> {
+                let val: i64 = if val { !0 } else { 0 };
+                _mm_set1_epi64x(val).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, val)
     }
     #[inline(always)]
-    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
-        f32x16 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        mask64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> (f32x4<Avx2>, f32x4<Avx2>) {
-                (
-                    _mm256_extractf128_ps::<0>(a.into()).simd_into(token),
-                    _mm256_extractf128_ps::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, bits: u64) -> mask64x2<Avx2> {
+                {
+                    let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
+                    let bit_mask = _mm_set_epi64x(2, 1);
+                    _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+                }
+                .simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f64x4<Avx2> {
-                _mm256_castps_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> u64 {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_castps_si256(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+                _mm_and_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u8x32<Avx2> {
-                _mm256_castps_si256(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+                _mm_or_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u32x8<Avx2> {
-                _mm256_castps_si256(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+                _mm_xor_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        self.xor_mask64x2(a, self.splat_mask64x2(true))
+    }
+    #[inline(always)]
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u32x8<Avx2> {
-                let mut converted = _mm256_cvttps_epi32(a.into());
-                let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0));
-                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
-                if !all_in_range {
-                    let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0));
-                    let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
-                    converted = _mm256_add_epi32(converted, excess_converted);
-                }
-                converted.simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: mask64x2<Avx2>,
+                b: mask64x2<Avx2>,
+                c: mask64x2<Avx2>,
+            ) -> mask64x2<Avx2> {
+                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x2<Avx2> {
+                _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u32x8<Avx2> {
-                let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
-                let mut converted = _mm256_cvttps_epi32(a);
-                let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
-                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
-                if !all_in_range {
-                    let exceeds_unsigned_range = _mm256_castps_si256(_mm256_cmp_ps::<17i32>(
-                        _mm256_set1_ps(4294967040.0),
-                        a,
-                    ));
-                    let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0));
-                    let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess));
-                    converted = _mm256_add_epi32(converted, excess_converted);
-                    converted = _mm256_blendv_epi8(
-                        converted,
-                        _mm256_set1_epi32(u32::MAX.cast_signed()),
-                        exceeds_unsigned_range,
-                    );
-                }
-                converted.simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_cvttps_epi32(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x8<Avx2>) -> i32x8<Avx2> {
-                let a = a.into();
-                let mut converted = _mm256_cvttps_epi32(a);
-                let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0));
-                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111;
-                if !all_in_range {
-                    converted = _mm256_blendv_epi8(
-                        _mm256_set1_epi32(i32::MAX),
-                        converted,
-                        _mm256_castps_si256(in_range),
-                    );
-                    let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a));
-                    converted = _mm256_and_si256(converted, is_not_nan);
-                }
-                converted.simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: i8) -> i8x32<Avx2> {
-                _mm256_set1_epi8(val).simd_into(token)
+            fn kernel(token: Avx2, a: mask64x2<Avx2>, b: mask64x2<Avx2>) -> mask64x4<Avx2> {
+                _mm256_setr_m128i(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: f32) -> f32x8<Avx2> {
+                _mm256_set1_ps(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0)
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0)
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
-        i8x32 {
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_256x1(
             self,
-            self.cvt_to_bytes_i8x32(b).val.0,
-            self.cvt_to_bytes_i8x32(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_i8x32(u8x32 {
+        self.cvt_from_bytes_f32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
         self,
-        a: i8x32<Self>,
-        b: i8x32<Self>,
-    ) -> i8x32<Self> {
-        if SHIFT >= 16usize {
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        if SHIFT >= 4usize {
             return b;
         }
         let result = dyn_alignr_256(
             self,
-            self.cvt_to_bytes_i8x32(b).val.0,
-            self.cvt_to_bytes_i8x32(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_i8x32(u8x32 {
+        self.cvt_from_bytes_f32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_add_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_sub_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Square root of each f32 lane.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                let dst_even = _mm256_mullo_epi16(a.into(), b.into());
-                let dst_odd = _mm256_mullo_epi16(
-                    _mm256_srli_epi16::<8>(a.into()),
-                    _mm256_srli_epi16::<8>(b.into()),
-                );
-                _mm256_or_si256(
-                    _mm256_slli_epi16(dst_odd, 8),
-                    _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_sqrt_ps(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Approximate 1/a for each f32 lane.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_rcp_ps(a.into()).simd_into(token) // Hardware reciprocal estimate, not a full-precision divide.
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise a + b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_add_ps(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise a - b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_sub_ps(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        a ^ !0
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise a * b.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_mul_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise a / b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, shift: u32) -> i8x32<Avx2> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 =
-                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let hi_16 =
-                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
-                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_div_ps(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
-    }
-    #[inline(always)]
-    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Per lane: magnitude bits of a combined with the sign bit of b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, shift: u32) -> i8x32<Avx2> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 =
-                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let hi_16 =
-                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let lo_shifted = _mm256_sra_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_sra_epi16(hi_16, shift_count);
-                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let mask = _mm256_set1_ps(-0.0); // Sign-bit-only mask in every lane.
+                _mm256_or_ps(
+                    _mm256_and_ps(mask, b.into()), // Sign bit taken from b.
+                    _mm256_andnot_ps(mask, a.into()), // All non-sign bits taken from a.
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a, shift)
-    }
-    #[inline(always)]
-    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> { // Lane-wise a == b, returned as a mask32x8.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_castps_si256(_mm256_cmp_ps::<0i32>(a.into(), b.into())).simd_into(token) // Predicate 0 is _CMP_EQ_OQ (ordered, quiet): lanes with a NaN operand compare false.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> { // Lane-wise a < b, returned as a mask32x8.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_castps_si256(_mm256_cmp_ps::<17i32>(a.into(), b.into())).simd_into(token) // Predicate 17 is _CMP_LT_OQ (ordered, quiet): lanes with a NaN operand compare false.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> { // Lane-wise a <= b, returned as a mask32x8.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_castps_si256(_mm256_cmp_ps::<18i32>(a.into(), b.into())).simd_into(token) // Predicate 18 is _CMP_LE_OQ (ordered, quiet): lanes with a NaN operand compare false.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> { // Lane-wise a >= b, returned as a mask32x8.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_castps_si256(_mm256_cmp_ps::<29i32>(a.into(), b.into())).simd_into(token) // Predicate 29 is _CMP_GE_OQ (ordered, quiet): lanes with a NaN operand compare false.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> { // Lane-wise a > b, returned as a mask32x8.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_castps_si256(_mm256_cmp_ps::<30i32>(a.into(), b.into())).simd_into(token) // Predicate 30 is _CMP_GT_OQ (ordered, quiet): lanes with a NaN operand compare false.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Interleaves lanes 0..4 of a and b: a0, b0, a1, b1, a2, b2, a3, b3.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                let lo = _mm256_unpacklo_epi8(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi8(a.into(), b.into());
-                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let lo = _mm256_unpacklo_ps(a.into(), b.into()); // Works per 128-bit half: a0,b0,a1,b1 | a4,b4,a5,b5.
+                let hi = _mm256_unpackhi_ps(a.into(), b.into()); // Works per 128-bit half: a2,b2,a3,b3 | a6,b6,a7,b7.
+                _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token) // Low 128 bits of lo followed by low 128 bits of hi.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Interleaves lanes 4..8 of a and b: a4, b4, a5, b5, a6, b6, a7, b7.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                let lo = _mm256_unpacklo_epi8(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi8(a.into(), b.into());
-                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let lo = _mm256_unpacklo_ps(a.into(), b.into()); // Works per 128-bit half: a0,b0,a1,b1 | a4,b4,a5,b5.
+                let hi = _mm256_unpackhi_ps(a.into(), b.into()); // Works per 128-bit half: a2,b2,a3,b3 | a6,b6,a7,b7.
+                _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token) // High 128 bits of lo followed by high 128 bits of hi.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Even-indexed lanes of a followed by even-indexed lanes of b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
-                    a.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
-                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                    ),
-                ));
-                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
-                    b.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
-                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                    ),
-                ));
-                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let t1 =
+                    _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); // Even lanes of a into the low half, odd lanes into the high half.
+                let t2 =
+                    _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); // Same rearrangement for b.
+                _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token) // Low (even-lane) halves of t1 and t2.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Odd-indexed lanes of a followed by odd-indexed lanes of b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
-                    a.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
-                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                    ),
-                ));
-                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
-                    b.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
-                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                    ),
-                ));
-                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let t1 =
+                    _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); // Even lanes of a into the low half, odd lanes into the high half.
+                let t2 =
+                    _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); // Same rearrangement for b.
+                _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token) // High (odd-lane) halves of t1 and t2.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) { // Returns both interleaved halves (a0,b0..a3,b3 and a4,b4..a7,b7) from one pair of unpacks.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> (i8x32<Avx2>, i8x32<Avx2>) {
-                let lo = _mm256_unpacklo_epi8(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi8(a.into(), b.into());
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> (f32x8<Avx2>, f32x8<Avx2>) {
+                let lo = _mm256_unpacklo_ps(a.into(), b.into()); // Works per 128-bit half: a0,b0,a1,b1 | a4,b4,a5,b5.
+                let hi = _mm256_unpackhi_ps(a.into(), b.into()); // Works per 128-bit half: a2,b2,a3,b3 | a6,b6,a7,b7.
                 (
-                    _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token),
-                    _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token),
+                    _mm256_permute2f128_ps::<0b0010_0000>(lo, hi).simd_into(token), // Low 128-bit halves of lo and hi.
+                    _mm256_permute2f128_ps::<0b0011_0001>(lo, hi).simd_into(token), // High 128-bit halves of lo and hi.
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) { // Returns (even-indexed lanes of a then b, odd-indexed lanes of a then b).
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> (i8x32<Avx2>, i8x32<Avx2>) {
-                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
-                    a.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
-                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                    ),
-                ));
-                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
-                    b.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
-                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-                    ),
-                ));
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> (f32x8<Avx2>, f32x8<Avx2>) {
+                let t1 =
+                    _mm256_permutevar8x32_ps(a.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); // Even lanes of a into the low half, odd lanes into the high half.
+                let t2 =
+                    _mm256_permutevar8x32_ps(b.into(), _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)); // Same rearrangement for b.
                 (
-                    _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token),
-                    _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token),
+                    _mm256_permute2f128_ps::<0b0010_0000>(t1, t2).simd_into(token), // Low (even-lane) halves of t1 and t2.
+                    _mm256_permute2f128_ps::<0b0011_0001>(t1, t2).simd_into(token), // High (odd-lane) halves of t1 and t2.
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise maximum of a and b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask8x32<Avx2>,
-                b: i8x32<Avx2>,
-                c: i8x32<Avx2>,
-            ) -> i8x32<Avx2> {
-                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_max_ps(a.into(), b.into()).simd_into(token) // NOTE(review): max_ps returns its second operand when either lane is NaN; no NaN fix-up is applied here.
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise minimum of a and b.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_min_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_min_ps(a.into(), b.into()).simd_into(token) // NOTE(review): min_ps returns its second operand when either lane is NaN; no NaN fix-up is applied here.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise maximum that returns the other operand when exactly one input lane is NaN.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_max_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let intermediate = _mm256_max_ps(a.into(), b.into()); // max_ps yields b whenever either input lane is NaN, which already covers a being NaN.
+                let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); // Predicate 3 is _CMP_UNORD_Q: true exactly where b is NaN.
+                _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) // Where b is NaN, take a instead.
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
-        i8x64 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
-            simd: self,
-        }
+    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> { // Lane-wise minimum that returns the other operand when exactly one input lane is NaN.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>) -> f32x8<Avx2> {
+                let intermediate = _mm256_min_ps(a.into(), b.into()); // min_ps yields b whenever either input lane is NaN, which already covers a being NaN.
+                let b_is_nan = _mm256_cmp_ps::<3i32>(b.into(), b.into()); // Predicate 3 is _CMP_UNORD_Q: true exactly where b is NaN.
+                _mm256_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(token) // Where b is NaN, take a instead.
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> { // Lane-wise a * b + c.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>) -> (i8x16<Avx2>, i8x16<Avx2>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>, c: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token) // Fused multiply-add: the product is not rounded before the addition.
             }
         );
-        kernel(self, a)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> { // Lane-wise a * b - c.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>) -> i8x32<Avx2> {
-                _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>, b: f32x8<Avx2>, c: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token) // Fused multiply-subtract: the product is not rounded before the subtraction.
             }
         );
-        kernel(self, a)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Rounds each lane toward negative infinity.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>) -> u8x32<Avx2> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions from the rounding.
+                    .simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Rounds each lane toward positive infinity.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i8x32<Avx2>) -> u32x8<Avx2> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions from the rounding.
+                    .simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Rounds each lane to the nearest integer, ties going to the even value.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: u8) -> u8x32<Avx2> {
-                _mm256_set1_epi8(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) // TO_NEAREST_INT is round-half-to-even; NO_EXC suppresses floating-point exceptions.
+                    .simd_into(token)
             }
         );
-        kernel(self, val)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Fractional part of each lane, defined relative to truncation toward zero.
+        a - self.trunc_f32x8(a) // e.g. -1.5 - (-1.0) = -0.5, so negative inputs give non-positive results.
+    }
+    #[inline(always)]
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> { // Rounds each lane toward zero.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions from the rounding.
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> { // Per lane: b where mask a is set, otherwise c.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask32x8<Avx2>,
+                b: f32x8<Avx2>,
+                c: f32x8<Avx2>,
+            ) -> f32x8<Avx2> {
+                _mm256_blendv_ps(c.into(), b.into(), _mm256_castsi256_ps(a.into())).simd_into(token) // blendv_ps picks its second operand (b) where the mask lane's top bit is set.
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> { // Joins two 256-bit vectors into one f32x16.
+        f32x16 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]), // Stored as a pair of 256-bit registers: a first, then b.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) { // Splits into (lower four lanes, upper four lanes).
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> (f32x4<Avx2>, f32x4<Avx2>) {
+                (
+                    _mm256_extractf128_ps::<0>(a.into()).simd_into(token), // Lower 128 bits.
+                    _mm256_extractf128_ps::<1>(a.into()).simd_into(token), // Upper 128 bits.
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> { // Reinterprets the same 256 bits as four f64 lanes.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> f64x4<Avx2> {
+                _mm256_castps_pd(a.into()).simd_into(token) // Type-level cast only; no value conversion.
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> { // Reinterprets the same 256 bits as eight i32 lanes.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> i32x8<Avx2> {
+                _mm256_castps_si256(a.into()).simd_into(token) // Type-level cast only; no value conversion.
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> { // Reinterprets the same 256 bits as thirty-two u8 lanes.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u8x32<Avx2> {
+                _mm256_castps_si256(a.into()).simd_into(token) // Type-level cast only; no value conversion.
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> { // Reinterprets the same 256 bits as eight u32 lanes.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_castps_si256(a.into()).simd_into(token) // Type-level cast only; no value conversion.
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> { // Truncating f32 -> u32 conversion built from the signed cvttps conversion.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u32x8<Avx2> {
+                let mut converted = _mm256_cvttps_epi32(a.into()); // Signed truncating convert; lanes >= 2^31 come back as 0x8000_0000.
+                let in_range = _mm256_cmp_ps::<17i32>(a.into(), _mm256_set1_ps(2147483648.0)); // Predicate 17 is _CMP_LT_OQ: true where a < 2^31.
+                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; // All eight lanes in range: the signed result is already correct.
+                if !all_in_range {
+                    let excess = _mm256_sub_ps(a.into(), _mm256_set1_ps(2147483648.0)); // Amount by which each lane exceeds 2^31.
+                    let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); // ANDNOT zeroes the excess for in-range lanes.
+                    converted = _mm256_add_epi32(converted, excess_converted); // 0x8000_0000 + (a - 2^31) reconstructs the unsigned value.
+                }
+                converted.simd_into(token) // NOTE(review): negative and NaN lanes are not clamped on this path — confirm that is the intended contract.
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> { // Truncating f32 -> u32 conversion that clamps negative/NaN lanes to 0 and overly large lanes to u32::MAX.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> u32x8<Avx2> {
+                let a = _mm256_max_ps(a.into(), _mm256_setzero_ps()); // Clamps negatives to 0; max_ps returns its second operand (0) for NaN lanes.
+                let mut converted = _mm256_cvttps_epi32(a); // Signed truncating convert; lanes >= 2^31 come back as 0x8000_0000.
+                let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); // Predicate 17 is _CMP_LT_OQ: true where a < 2^31.
+                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; // All eight lanes in range: the signed result is already correct.
+                if !all_in_range {
+                    let exceeds_unsigned_range = _mm256_castps_si256(_mm256_cmp_ps::<17i32>(
+                        _mm256_set1_ps(4294967040.0), // Largest f32 below 2^32; anything greater does not fit in u32.
+                        a,
+                    ));
+                    let excess = _mm256_sub_ps(a, _mm256_set1_ps(2147483648.0)); // Amount by which each lane exceeds 2^31.
+                    let excess_converted = _mm256_cvttps_epi32(_mm256_andnot_ps(in_range, excess)); // ANDNOT zeroes the excess for in-range lanes.
+                    converted = _mm256_add_epi32(converted, excess_converted); // 0x8000_0000 + (a - 2^31) reconstructs the unsigned value.
+                    converted = _mm256_blendv_epi8(
+                        converted,
+                        _mm256_set1_epi32(u32::MAX.cast_signed()), // Saturation value for lanes beyond the u32 range.
+                        exceeds_unsigned_range,
+                    );
+                }
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> { // Truncating f32 -> i32 conversion of each lane.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> i32x8<Avx2> {
+                _mm256_cvttps_epi32(a.into()).simd_into(token) // Out-of-range and NaN lanes yield the instruction's 0x8000_0000 sentinel; no fix-up here.
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> { // Truncating f32 -> i32 conversion that saturates large lanes to i32::MAX and maps NaN to 0.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x8<Avx2>) -> i32x8<Avx2> {
+                let a = a.into();
+                let mut converted = _mm256_cvttps_epi32(a); // Out-of-range and NaN lanes come back as 0x8000_0000, which equals i32::MIN.
+                let in_range = _mm256_cmp_ps::<17i32>(a, _mm256_set1_ps(2147483648.0)); // Predicate 17 is _CMP_LT_OQ: true where a < 2^31 (false for NaN).
+                let all_in_range = _mm256_movemask_ps(in_range) == 0b11111111; // Fast path: skip the fix-up when every lane is below 2^31.
+                if !all_in_range {
+                    converted = _mm256_blendv_epi8(
+                        _mm256_set1_epi32(i32::MAX), // Used for lanes that are not in range.
+                        converted,
+                        _mm256_castps_si256(in_range),
+                    );
+                    let is_not_nan = _mm256_castps_si256(_mm256_cmp_ps::<7i32>(a, a)); // Predicate 7 is _CMP_ORD_Q: all-ones where a is not NaN.
+                    converted = _mm256_and_si256(converted, is_not_nan); // Zeroes the NaN lanes.
+                }
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> { // Broadcasts val into all 32 i8 lanes.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: i8) -> i8x32<Avx2> {
+                _mm256_set1_epi8(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> { // Builds an i8x32 from a by-value array via crate::transmute::checked_transmute_copy.
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> { // Builds an i8x32 from a borrowed array via crate::transmute::checked_transmute_copy.
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0)
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] { // Copies the vector's lanes out as a [i8; 32].
+        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0) // Copies out of the inner __m256i.
     }
     #[inline(always)]
-    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0)
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] { // Borrows the vector's storage as a [i8; 32] reference.
+        crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0) // Reference cast of the inner __m256i.
     }
     #[inline(always)]
-    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] { // Mutably borrows the vector's storage as a [i8; 32] reference.
+        crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0) // Mutable reference cast of the inner __m256i.
     }
     #[inline(always)]
-    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_alignr_256x1(
             self,
-            self.cvt_to_bytes_u8x32(b).val.0,
-            self.cvt_to_bytes_u8x32(a).val.0,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
             SHIFT,
         );
-        self.cvt_from_bytes_u8x32(u8x32 {
+        self.cvt_from_bytes_i8x32(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
         self,
-        a: u8x32<Self>,
-        b: u8x32<Self>,
-    ) -> u8x32<Self> {
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
         if SHIFT >= 16usize {
             return b;
         }
         let result = dyn_alignr_256(
             self,
-            self.cvt_to_bytes_u8x32(b).val.0,
-            self.cvt_to_bytes_u8x32(a).val.0,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
             SHIFT,
         );
-        self.cvt_from_bytes_u8x32(u8x32 {
+        self.cvt_from_bytes_i8x32(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 _mm256_add_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 _mm256_sub_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 let dst_even = _mm256_mullo_epi16(a.into(), b.into());
                 let dst_odd = _mm256_mullo_epi16(
                     _mm256_srli_epi16::<8>(a.into()),
@@ -5053,140 +5548,210 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, shift: u32) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, shift: u32) -> i8x32<Avx2> {
                 let val = a.into();
                 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
-                let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
+                let lo_16 =
+                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
+                let hi_16 =
+                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
                 let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
                 let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let a: [i8; 32usize] = a.into();
+        let b: [i8; 32usize] = b.into();
+        let result: [i8; 32usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+            core::ops::Shl::shl(a[16usize], b[16usize]),
+            core::ops::Shl::shl(a[17usize], b[17usize]),
+            core::ops::Shl::shl(a[18usize], b[18usize]),
+            core::ops::Shl::shl(a[19usize], b[19usize]),
+            core::ops::Shl::shl(a[20usize], b[20usize]),
+            core::ops::Shl::shl(a[21usize], b[21usize]),
+            core::ops::Shl::shl(a[22usize], b[22usize]),
+            core::ops::Shl::shl(a[23usize], b[23usize]),
+            core::ops::Shl::shl(a[24usize], b[24usize]),
+            core::ops::Shl::shl(a[25usize], b[25usize]),
+            core::ops::Shl::shl(a[26usize], b[26usize]),
+            core::ops::Shl::shl(a[27usize], b[27usize]),
+            core::ops::Shl::shl(a[28usize], b[28usize]),
+            core::ops::Shl::shl(a[29usize], b[29usize]),
+            core::ops::Shl::shl(a[30usize], b[30usize]),
+            core::ops::Shl::shl(a[31usize], b[31usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, shift: u32) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, shift: u32) -> i8x32<Avx2> {
                 let val = a.into();
                 let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
-                let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
-                let lo_shifted = _mm256_srl_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_srl_epi16(hi_16, shift_count);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+                let lo_16 =
+                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
+                let hi_16 =
+                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
+                let lo_shifted = _mm256_sra_epi16(lo_16, shift_count);
+                let hi_shifted = _mm256_sra_epi16(hi_16, shift_count);
+                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let a: [i8; 32usize] = a.into();
+        let b: [i8; 32usize] = b.into();
+        let result: [i8; 32usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+            core::ops::Shr::shr(a[16usize], b[16usize]),
+            core::ops::Shr::shr(a[17usize], b[17usize]),
+            core::ops::Shr::shr(a[18usize], b[18usize]),
+            core::ops::Shr::shr(a[19usize], b[19usize]),
+            core::ops::Shr::shr(a[20usize], b[20usize]),
+            core::ops::Shr::shr(a[21usize], b[21usize]),
+            core::ops::Shr::shr(a[22usize], b[22usize]),
+            core::ops::Shr::shr(a[23usize], b[23usize]),
+            core::ops::Shr::shr(a[24usize], b[24usize]),
+            core::ops::Shr::shr(a[25usize], b[25usize]),
+            core::ops::Shr::shr(a[26usize], b[26usize]),
+            core::ops::Shr::shr(a[27usize], b[27usize]),
+            core::ops::Shr::shr(a[28usize], b[28usize]),
+            core::ops::Shr::shr(a[29usize], b[29usize]),
+            core::ops::Shr::shr(a[30usize], b[30usize]),
+            core::ops::Shr::shr(a[31usize], b[31usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
                 _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
-                let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed());
-                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
-                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
-                _mm256_cmpgt_epi8(b_signed, a_signed).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpgt_epi8(b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpeq_epi8(_mm256_min_epi8(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpeq_epi8(_mm256_max_epi8(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
-                let sign_bit = _mm256_set1_epi8(0x80u8.cast_signed());
-                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
-                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
-                _mm256_cmpgt_epi8(a_signed, b_signed).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpgt_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
                 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
@@ -5195,10 +5760,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
                 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
@@ -5207,10 +5772,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
@@ -5231,10 +5796,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
@@ -5255,10 +5820,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> (u8x32<Avx2>, u8x32<Avx2>) {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> (i8x32<Avx2>, i8x32<Avx2>) {
                 let lo = _mm256_unpacklo_epi8(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi8(a.into(), b.into());
                 (
@@ -5270,10 +5835,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> (u8x32<Avx2>, u8x32<Avx2>) {
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> (i8x32<Avx2>, i8x32<Avx2>) {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
@@ -5297,52 +5862,52 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
                 a: mask8x32<Avx2>,
-                b: u8x32<Avx2>,
-                c: u8x32<Avx2>,
-            ) -> u8x32<Avx2> {
+                b: i8x32<Avx2>,
+                c: i8x32<Avx2>,
+            ) -> i8x32<Avx2> {
                 _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
-                _mm256_min_epu8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
+                _mm256_min_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
-                _mm256_max_epu8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>, b: i8x32<Avx2>) -> i8x32<Avx2> {
+                _mm256_max_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        i8x64 {
             val: crate::support::Aligned512([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>) -> (u8x16<Avx2>, u8x16<Avx2>) {
+            fn kernel(token: Avx2, a: i8x32<Avx2>) -> (i8x16<Avx2>, i8x16<Avx2>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -5352,493 +5917,407 @@ impl Simd for Avx2 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>) -> u16x32<Avx2> {
-                let (a0, a1) = token.split_u8x32(a);
-                let high = _mm256_cvtepu8_epi16(a0.into()).simd_into(token);
-                let low = _mm256_cvtepu8_epi16(a1.into()).simd_into(token);
-                token.combine_u16x16(high, low)
+            fn kernel(token: Avx2, a: i8x32<Avx2>) -> i8x32<Avx2> {
+                _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x32<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i8x32<Avx2>) -> u8x32<Avx2> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: bool) -> mask8x32<Avx2> {
-                let val: i8 = if val { !0 } else { 0 };
-                _mm256_set1_epi8(val).simd_into(token)
+            fn kernel(token: Avx2, a: i8x32<Avx2>) -> u32x8<Avx2> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: u8) -> u8x32<Avx2> {
+                _mm256_set1_epi8(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
-        mask8x32 {
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0)
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask8x32<Avx2> {
-                {
-                    let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32));
-                    let bit_bytes = _mm256_shuffle_epi8(
-                        bit_bytes,
-                        _mm256_setr_epi8(
-                            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
-                            3, 3, 3, 3, 3, 3, 3, 3,
-                        ),
-                    );
-                    let bit_mask = _mm256_setr_epi8(
-                        1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16,
-                        32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
-                    );
-                    _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
-                }
-                .simd_into(token)
-            }
-        );
-        kernel(self, bits)
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> u64 {
-                _mm256_movemask_epi8(a.into()) as u32 as u64
-            }
-        );
-        kernel(self, a)
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 32usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32usize
-        );
-        let mut lanes = self.as_array_mask8x32(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x32(lanes);
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
-        self.xor_mask8x32(a, self.splat_mask8x32(true))
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_u8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask8x32(
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
         self,
-        a: mask8x32<Self>,
-        b: mask8x32<Self>,
-        c: mask8x32<Self>,
-    ) -> mask8x32<Self> {
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_u8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask8x32<Avx2>,
-                b: mask8x32<Avx2>,
-                c: mask8x32<Avx2>,
-            ) -> mask8x32<Avx2> {
-                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_add_epi8(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
-                _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_sub_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 != 0
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                // Multiply as 16-bit lanes: once as-is for the even bytes (low half of each
+                // lane) and once with both inputs moved down 8 bits for the odd bytes.
+                let (lhs, rhs): (__m256i, __m256i) = (a.into(), b.into());
+                let even_products = _mm256_mullo_epi16(lhs, rhs);
+                let odd_products =
+                    _mm256_mullo_epi16(_mm256_srli_epi16::<8>(lhs), _mm256_srli_epi16::<8>(rhs));
+                // Only the low byte of each product is kept; odd bytes return to the high half.
+                let even_bytes = _mm256_and_si256(even_products, _mm256_set1_epi16(0xFF));
+                let odd_bytes = _mm256_slli_epi16(odd_products, 8);
+                _mm256_or_si256(odd_bytes, even_bytes).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 == 0
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
-            simd: self,
-        }
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        a ^ !0
     }
     #[inline(always)]
-    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> (mask8x16<Avx2>, mask8x16<Avx2>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: u8x32<Avx2>, shift: u32) -> u8x32<Avx2> {
+                // Shift 16-bit lanes, then clear the low `shift` bits of each byte (carried in
+                // from the byte below). Bytes wrap, not saturate (0xFF << 1 == 0xFE); >= 8 gives 0.
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+                let shifted = _mm256_sll_epi16(a.into(), shift_count);
+                let keep = 0xFFu8.checked_shl(shift).unwrap_or(0);
+                let keep_mask = _mm256_set1_epi8(keep.cast_signed());
+                _mm256_and_si256(shifted, keep_mask).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let a: [u8; 32usize] = a.into();
+        let b: [u8; 32usize] = b.into();
+        let result: [u8; 32usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+            core::ops::Shl::shl(a[16usize], b[16usize]),
+            core::ops::Shl::shl(a[17usize], b[17usize]),
+            core::ops::Shl::shl(a[18usize], b[18usize]),
+            core::ops::Shl::shl(a[19usize], b[19usize]),
+            core::ops::Shl::shl(a[20usize], b[20usize]),
+            core::ops::Shl::shl(a[21usize], b[21usize]),
+            core::ops::Shl::shl(a[22usize], b[22usize]),
+            core::ops::Shl::shl(a[23usize], b[23usize]),
+            core::ops::Shl::shl(a[24usize], b[24usize]),
+            core::ops::Shl::shl(a[25usize], b[25usize]),
+            core::ops::Shl::shl(a[26usize], b[26usize]),
+            core::ops::Shl::shl(a[27usize], b[27usize]),
+            core::ops::Shl::shl(a[28usize], b[28usize]),
+            core::ops::Shl::shl(a[29usize], b[29usize]),
+            core::ops::Shl::shl(a[30usize], b[30usize]),
+            core::ops::Shl::shl(a[31usize], b[31usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: i16) -> i16x16<Avx2> {
-                _mm256_set1_epi16(val).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, shift: u32) -> u8x32<Avx2> {
+                // Zero-extend the bytes to 16-bit lanes, shift those, then narrow back to bytes.
+                let (bytes, zero): (__m256i, __m256i) = (a.into(), _mm256_setzero_si256());
+                let count = _mm_cvtsi32_si128(shift.cast_signed());
+                let lo_wide = _mm256_srl_epi16(_mm256_unpacklo_epi8(bytes, zero), count);
+                let hi_wide = _mm256_srl_epi16(_mm256_unpackhi_epi8(bytes, zero), count);
+                // Values stay within 0..=0xFF, so the saturating pack is lossless.
+                _mm256_packus_epi16(lo_wide, hi_wide).simd_into(token)
             }
         );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0)
-    }
-    #[inline(always)]
-    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
-    }
-    #[inline(always)]
-    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_alignr_256x1(
-            self,
-            self.cvt_to_bytes_i16x16(b).val.0,
-            self.cvt_to_bytes_i16x16(a).val.0,
-            SHIFT * 2usize,
-        );
-        self.cvt_from_bytes_i16x16(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let a: [u8; 32usize] = a.into();
+        let b: [u8; 32usize] = b.into();
+        let result: [u8; 32usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+            core::ops::Shr::shr(a[16usize], b[16usize]),
+            core::ops::Shr::shr(a[17usize], b[17usize]),
+            core::ops::Shr::shr(a[18usize], b[18usize]),
+            core::ops::Shr::shr(a[19usize], b[19usize]),
+            core::ops::Shr::shr(a[20usize], b[20usize]),
+            core::ops::Shr::shr(a[21usize], b[21usize]),
+            core::ops::Shr::shr(a[22usize], b[22usize]),
+            core::ops::Shr::shr(a[23usize], b[23usize]),
+            core::ops::Shr::shr(a[24usize], b[24usize]),
+            core::ops::Shr::shr(a[25usize], b[25usize]),
+            core::ops::Shr::shr(a[26usize], b[26usize]),
+            core::ops::Shr::shr(a[27usize], b[27usize]),
+            core::ops::Shr::shr(a[28usize], b[28usize]),
+            core::ops::Shr::shr(a[29usize], b[29usize]),
+            core::ops::Shr::shr(a[30usize], b[30usize]),
+            core::ops::Shr::shr(a[31usize], b[31usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x16<const SHIFT: usize>(
-        self,
-        a: i16x16<Self>,
-        b: i16x16<Self>,
-    ) -> i16x16<Self> {
-        if SHIFT >= 8usize {
-            return b;
-        }
-        let result = dyn_alignr_256(
-            self,
-            self.cvt_to_bytes_i16x16(b).val.0,
-            self.cvt_to_bytes_i16x16(a).val.0,
-            SHIFT * 2usize,
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token)
+            }
         );
-        self.cvt_from_bytes_i16x16(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_add_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
+                let top_bit = _mm256_set1_epi8(i8::MIN); // bias: unsigned order -> signed order
+                let lhs = _mm256_xor_si256(a.into(), top_bit);
+                let rhs = _mm256_xor_si256(b.into(), top_bit);
+                _mm256_cmpgt_epi8(rhs, lhs).simd_into(token) // a < b  <=>  b > a
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_sub_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpeq_epi8(_mm256_min_epu8(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_mullo_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpeq_epi8(_mm256_max_epu8(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> mask8x32<Avx2> {
+                let top_bit = _mm256_set1_epi8(i8::MIN); // bias: unsigned order -> signed order
+                let lhs = _mm256_xor_si256(a.into(), top_bit);
+                let rhs = _mm256_xor_si256(b.into(), top_bit);
+                _mm256_cmpgt_epi8(lhs, rhs).simd_into(token) // signed compare on biased bytes
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                let lo = _mm256_unpacklo_epi8(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi8(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                let lo = _mm256_unpacklo_epi8(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi8(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        a ^ !0
-    }
-    #[inline(always)]
-    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, shift: u32) -> i16x16<Avx2> {
-                _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
-            }
-        );
-        kernel(self, a, shift)
-    }
-    #[inline(always)]
-    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
-    }
-    #[inline(always)]
-    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, shift: u32) -> i16x16<Avx2> {
-                _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
-            }
-        );
-        kernel(self, a, shift)
-    }
-    #[inline(always)]
-    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
-    }
-    #[inline(always)]
-    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                let lo = _mm256_unpacklo_epi16(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi16(a.into(), b.into());
-                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                let lo = _mm256_unpacklo_epi16(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi16(a.into(), b.into());
-                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
-                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
-                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
+                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                     ),
                 ));
                 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     b.into(),
                     _mm256_setr_epi8(
-                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
-                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
+                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                     ),
                 ));
                 _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token)
@@ -5847,22 +6326,22 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
-                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
-                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
+                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                     ),
                 ));
                 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     b.into(),
                     _mm256_setr_epi8(
-                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
-                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
+                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                     ),
                 ));
                 _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token)
@@ -5871,16 +6350,12 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: i16x16<Avx2>,
-                b: i16x16<Avx2>,
-            ) -> (i16x16<Avx2>, i16x16<Avx2>) {
-                let lo = _mm256_unpacklo_epi16(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi16(a.into(), b.into());
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> (u8x32<Avx2>, u8x32<Avx2>) {
+                let lo = _mm256_unpacklo_epi8(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi8(a.into(), b.into());
                 (
                     _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token),
                     _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token),
@@ -5890,26 +6365,22 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: i16x16<Avx2>,
-                b: i16x16<Avx2>,
-            ) -> (i16x16<Avx2>, i16x16<Avx2>) {
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> (u8x32<Avx2>, u8x32<Avx2>) {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
-                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
-                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
+                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                     ),
                 ));
                 let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     b.into(),
                     _mm256_setr_epi8(
-                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
-                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10,
+                        12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                     ),
                 ));
                 (
@@ -5921,52 +6392,52 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
-                a: mask16x16<Avx2>,
-                b: i16x16<Avx2>,
-                c: i16x16<Avx2>,
-            ) -> i16x16<Avx2> {
+                a: mask8x32<Avx2>,
+                b: u8x32<Avx2>,
+                c: u8x32<Avx2>,
+            ) -> u8x32<Avx2> {
                 _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_min_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_min_epu8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_max_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>, b: u8x32<Avx2>) -> u8x32<Avx2> {
+                _mm256_max_epu8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        u8x64 {
             val: crate::support::Aligned512([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>) -> (i16x8<Avx2>, i16x8<Avx2>) {
+            fn kernel(token: Avx2, a: u8x32<Avx2>) -> (u8x16<Avx2>, u8x16<Avx2>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -5976,278 +6447,497 @@ impl Simd for Avx2 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>) -> i16x16<Avx2> {
-                _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>) -> u8x32<Avx2> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx2, a: u8x32<Avx2>) -> u16x32<Avx2> {
+                let (a0, a1) = token.split_u8x32(a); // a0 = lower 128-bit half, a1 = upper 128-bit half
+                let low = _mm256_cvtepu8_epi16(a0.into()).simd_into(token); // zero-extend each u8 lane to u16
+                let high = _mm256_cvtepu8_epi16(a1.into()).simd_into(token);
+                token.combine_u16x16(low, high) // lower half first, so lane order is preserved
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i16x16<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: u8x32<Avx2>) -> u32x8<Avx2> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: u16) -> u16x16<Avx2> {
-                _mm256_set1_epi16(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx2, val: bool) -> mask8x32<Avx2> {
+                let val: i8 = if val { !0 } else { 0 }; // true -> all bits set (-1), false -> 0
+                _mm256_set1_epi8(val).simd_into(token) // same byte in every lane
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        mask8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0)
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0)
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, bits: u64) -> mask8x32<Avx2> {
+                {
+                    let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); // only the low 32 bits of `bits` are used; copied into both 128-bit halves
+                    let bit_bytes = _mm256_shuffle_epi8( // spread source byte k over lanes 8k..8k+8
+                        bit_bytes,
+                        _mm256_setr_epi8(
+                            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+                            3, 3, 3, 3, 3, 3, 3, 3,
+                        ),
+                    );
+                    let bit_mask = _mm256_setr_epi8( // a single bit per lane, cycling through bits 0..8
+                        1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16,
+                        32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
+                    );
+                    _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) // lane is all-ones when its bit is set, zero otherwise
+                }
+                .simd_into(token)
+            }
+        );
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0)
-    }
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> u64 {
+                _mm256_movemask_epi8(a.into()) as u32 as u64 // cast through u32 so the i32 result is zero-extended, not sign-extended
+            }
+        );
+        kernel(self, a)
+    }
     #[inline(always)]
-    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!( // panics when `index` is not a valid lane (0..32)
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a); // copy the mask out as 32 i8 lanes
+        lanes[index] = if value { !0 } else { 0 }; // true -> all bits set, false -> 0
+        *a = self.load_array_mask8x32(lanes); // write the edited lanes back into `a`
+    }
+    #[inline(always)]
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token) // bitwise AND over the full 256-bit register
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token) // bitwise OR over the full 256-bit register
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token) // bitwise XOR over the full 256-bit register
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        self.xor_mask8x32(a, self.splat_mask8x32(true)) // XOR against an all-ones mask flips every bit
+    }
+    #[inline(always)]
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask8x32<Avx2>,
+                b: mask8x32<Avx2>,
+                c: mask8x32<Avx2>,
+            ) -> mask8x32<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) // `a` is the selector: lane from `b` where a's top bit is set, else from `c`
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>, b: mask8x32<Avx2>) -> mask8x32<Avx2> {
+                _mm256_cmpeq_epi8(a.into(), b.into()).simd_into(token) // lane is all-ones where the two bytes are equal, zero otherwise
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 != 0 // movemask packs each lane's top bit into one of 32 bits; any bit set -> some lane is true
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff // all 32 per-lane bits set -> every lane is true
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff // at least one of the 32 per-lane bits is clear
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 == 0 // none of the 32 per-lane bits is set
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]), // `a` becomes the first 256-bit half, `b` the second
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask8x32<Avx2>) -> (mask8x16<Avx2>, mask8x16<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token), // lower 128 bits
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token), // upper 128 bits
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: i16) -> i16x16<Avx2> {
+                _mm256_set1_epi16(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
-        u16x16 {
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        i16x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_alignr_256x1(
             self,
-            self.cvt_to_bytes_u16x16(b).val.0,
-            self.cvt_to_bytes_u16x16(a).val.0,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
             SHIFT * 2usize,
         );
-        self.cvt_from_bytes_u16x16(u8x32 {
+        self.cvt_from_bytes_i16x16(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
         self,
-        a: u16x16<Self>,
-        b: u16x16<Self>,
-    ) -> u16x16<Self> {
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = dyn_alignr_256(
             self,
-            self.cvt_to_bytes_u16x16(b).val.0,
-            self.cvt_to_bytes_u16x16(a).val.0,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
             SHIFT * 2usize,
         );
-        self.cvt_from_bytes_u16x16(u8x32 {
+        self.cvt_from_bytes_i16x16(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 _mm256_add_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 _mm256_sub_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 _mm256_mullo_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, shift: u32) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, shift: u32) -> i16x16<Avx2> {
                 _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let a: [i16; 16usize] = a.into();
+        let b: [i16; 16usize] = b.into();
+        let result: [i16; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, shift: u32) -> u16x16<Avx2> {
-                _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, shift: u32) -> i16x16<Avx2> {
+                _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) // arithmetic shift: every lane uses the same count and keeps its sign
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let a: [i16; 16usize] = a.into();
+        let b: [i16; 16usize] = b.into();
+        let result: [i16; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
                 _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
-                let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
-                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
-                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
-                _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpgt_epi16(b.into(), a.into()).simd_into(token) // a < b is evaluated as b > a (operands swapped)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpeq_epi16(_mm256_min_epi16(a.into(), b.into()), a.into()).simd_into(token) // a <= b exactly when min(a, b) == a (signed min)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpeq_epi16(_mm256_max_epi16(a.into(), b.into()), a.into()).simd_into(token) // a >= b exactly when max(a, b) == a (signed max)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
-                let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
-                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
-                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
-                _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpgt_epi16(a.into(), b.into()).simd_into(token) // signed compare applied to the lanes directly, with no sign-bit flip
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
                 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
@@ -6256,10 +6946,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
                 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
@@ -6268,10 +6958,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
@@ -6292,10 +6982,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
@@ -6316,14 +7006,14 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
-                a: u16x16<Avx2>,
-                b: u16x16<Avx2>,
-            ) -> (u16x16<Avx2>, u16x16<Avx2>) {
+                a: i16x16<Avx2>,
+                b: i16x16<Avx2>,
+            ) -> (i16x16<Avx2>, i16x16<Avx2>) {
                 let lo = _mm256_unpacklo_epi16(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi16(a.into(), b.into());
                 (
@@ -6335,14 +7025,14 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
-                a: u16x16<Avx2>,
-                b: u16x16<Avx2>,
-            ) -> (u16x16<Avx2>, u16x16<Avx2>) {
+                a: i16x16<Avx2>,
+                b: i16x16<Avx2>,
+            ) -> (i16x16<Avx2>, i16x16<Avx2>) {
                 let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
                     a.into(),
                     _mm256_setr_epi8(
@@ -6366,52 +7056,52 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
                 a: mask16x16<Avx2>,
-                b: u16x16<Avx2>,
-                c: u16x16<Avx2>,
-            ) -> u16x16<Avx2> {
+                b: i16x16<Avx2>,
+                c: i16x16<Avx2>,
+            ) -> i16x16<Avx2> {
                 _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
-                _mm256_min_epu16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
+                _mm256_min_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
-                _mm256_max_epu16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>, b: i16x16<Avx2>) -> i16x16<Avx2> {
+                _mm256_max_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::support::Aligned512([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>) -> (u16x8<Avx2>, u16x8<Avx2>) {
+            fn kernel(token: Avx2, a: i16x16<Avx2>) -> (i16x8<Avx2>, i16x8<Avx2>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -6421,921 +7111,965 @@ impl Simd for Avx2 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>) -> u8x16<Avx2> {
-                let mask = _mm256_setr_epi8(
-                    0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10,
-                    12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
-                );
-                let shuffled = _mm256_shuffle_epi8(a.into(), mask);
-                let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled);
-                _mm256_castsi256_si128(packed).simd_into(token)
+            fn kernel(token: Avx2, a: i16x16<Avx2>) -> i16x16<Avx2> {
+                _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>) -> u8x32<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>) -> u8x32<Avx2> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x16<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i16x16<Avx2>) -> u32x8<Avx2> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: bool) -> mask16x16<Avx2> {
-                let val: i16 = if val { !0 } else { 0 };
-                _mm256_set1_epi16(val).simd_into(token)
+            fn kernel(token: Avx2, val: u16) -> u16x16<Avx2> {
+                _mm256_set1_epi16(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        mask16x16 {
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0)
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        u16x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_u16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+        self,
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_u16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask16x16<Avx2> {
-                {
-                    let bit_lanes = _mm256_set1_epi16(bits as i16);
-                    let bit_mask = _mm256_setr_epi16(
-                        1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384,
-                        -32768,
-                    );
-                    _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
-                }
-                .simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                _mm256_add_epi16(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, bits)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> u64 {
-                {
-                    let halves: [__m128i; 2usize] =
-                        crate::transmute::checked_transmute_copy(&a.val.0);
-                    let packed = _mm_packs_epi16(halves[0], halves[1]);
-                    _mm_movemask_epi8(packed) as u32 as u64
-                }
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                _mm256_sub_epi16(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                _mm256_mullo_epi16(a.into(), b.into()).simd_into(token)
+            }
         );
-        let mut lanes = self.as_array_mask16x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x16(lanes);
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
-        self.xor_mask16x16(a, self.splat_mask16x16(true))
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        a ^ !0
     }
     #[inline(always)]
-    fn select_mask16x16(
-        self,
-        a: mask16x16<Self>,
-        b: mask16x16<Self>,
-        c: mask16x16<Self>,
-    ) -> mask16x16<Self> {
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask16x16<Avx2>,
-                b: mask16x16<Avx2>,
-                c: mask16x16<Avx2>,
-            ) -> mask16x16<Avx2> {
-                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, shift: u32) -> u16x16<Avx2> {
+                _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
-                _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let a: [u16; 16usize] = a.into();
+        let b: [u16; 16usize] = b.into();
+        let result: [u16; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 != 0
+            fn kernel(token: Avx2, a: u16x16<Avx2>, shift: u32) -> u16x16<Avx2> {
+                _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff
-            }
-        );
-        kernel(self, a)
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let a: [u16; 16usize] = a.into();
+        let b: [u16; 16usize] = b.into();
+        let result: [u16; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
-                _mm256_movemask_epi8(a.into()) as u32 == 0
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
+                let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
+                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
+                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
+                _mm256_cmpgt_epi16(b_signed, a_signed).simd_into(token)
             }
         );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
-            simd: self,
-        }
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> (mask16x8<Avx2>, mask16x8<Avx2>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpeq_epi16(_mm256_min_epu16(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: i32) -> i32x8<Avx2> {
-                _mm256_set1_epi32(val).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpeq_epi16(_mm256_max_epu16(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0)
-    }
-    #[inline(always)]
-    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
-    }
-    #[inline(always)]
-    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
-        i32x8 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        if SHIFT >= 8usize {
-            return b;
-        }
-        let result = cross_block_alignr_256x1(
-            self,
-            self.cvt_to_bytes_i32x8(b).val.0,
-            self.cvt_to_bytes_i32x8(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_i32x8(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_i32x8<const SHIFT: usize>(
-        self,
-        a: i32x8<Self>,
-        b: i32x8<Self>,
-    ) -> i32x8<Self> {
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let result = dyn_alignr_256(
-            self,
-            self.cvt_to_bytes_i32x8(b).val.0,
-            self.cvt_to_bytes_i32x8(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_i32x8(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_add_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> mask16x16<Avx2> {
+                let sign_bit = _mm256_set1_epi16(0x8000u16.cast_signed());
+                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
+                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
+                _mm256_cmpgt_epi16(a_signed, b_signed).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                let lo = _mm256_unpacklo_epi16(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi16(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                let lo = _mm256_unpacklo_epi16(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi16(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
+                    a.into(),
+                    _mm256_setr_epi8(
+                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
+                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                    ),
+                ));
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
+                    b.into(),
+                    _mm256_setr_epi8(
+                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
+                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                    ),
+                ));
+                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
+                    a.into(),
+                    _mm256_setr_epi8(
+                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
+                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                    ),
+                ));
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
+                    b.into(),
+                    _mm256_setr_epi8(
+                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
+                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                    ),
+                ));
+                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: u16x16<Avx2>,
+                b: u16x16<Avx2>,
+            ) -> (u16x16<Avx2>, u16x16<Avx2>) {
+                let lo = _mm256_unpacklo_epi16(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi16(a.into(), b.into());
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token),
+                    _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token),
+                )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        a ^ !0
-    }
-    #[inline(always)]
-    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, shift: u32) -> i32x8<Avx2> {
-                _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
-            }
-        );
-        kernel(self, a, shift)
-    }
-    #[inline(always)]
-    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: u16x16<Avx2>,
+                b: u16x16<Avx2>,
+            ) -> (u16x16<Avx2>, u16x16<Avx2>) {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
+                    a.into(),
+                    _mm256_setr_epi8(
+                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
+                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                    ),
+                ));
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(_mm256_shuffle_epi8(
+                    b.into(),
+                    _mm256_setr_epi8(
+                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 12,
+                        13, 2, 3, 6, 7, 10, 11, 14, 15,
+                    ),
+                ));
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token),
+                    _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token),
+                )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, shift: u32) -> i32x8<Avx2> {
-                _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: mask16x16<Avx2>,
+                b: u16x16<Avx2>,
+                c: u16x16<Avx2>,
+            ) -> u16x16<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_srav_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                _mm256_min_epu16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>, b: u16x16<Avx2>) -> u16x16<Avx2> {
+                _mm256_max_epu16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        u16x32 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>) -> (u16x8<Avx2>, u16x8<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>) -> u8x16<Avx2> {
+                let mask = _mm256_setr_epi8(
+                    0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1, 0, 2, 4, 6, 8, 10,
+                    12, 14, -1, -1, -1, -1, -1, -1, -1, -1,
+                );
+                let shuffled = _mm256_shuffle_epi8(a.into(), mask);
+                let packed = _mm256_permute4x64_epi64::<0b11_01_10_00>(shuffled);
+                _mm256_castsi256_si128(packed).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>) -> u8x32<Avx2> {
+                __m256i::from(a).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u16x16<Avx2>) -> u32x8<Avx2> {
+                __m256i::from(a).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                let lo = _mm256_unpacklo_epi32(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi32(a.into(), b.into());
-                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, val: bool) -> mask16x16<Avx2> {
+                let val: i16 = if val { !0 } else { 0 };
+                _mm256_set1_epi16(val).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, val)
     }
     #[inline(always)]
-    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        mask16x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                let lo = _mm256_unpacklo_epi32(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi32(a.into(), b.into());
-                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, bits: u64) -> mask16x16<Avx2> {
+                {
+                    let bit_lanes = _mm256_set1_epi16(bits as i16);
+                    let bit_mask = _mm256_setr_epi16(
+                        1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384,
+                        -32768,
+                    );
+                    _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                }
+                .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                let t1 = _mm256_permutevar8x32_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
-                );
-                let t2 = _mm256_permutevar8x32_epi32(
-                    b.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
-                );
-                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> u64 {
+                {
+                    let halves: [__m128i; 2usize] =
+                        crate::transmute::checked_transmute_copy(&a.val.0);
+                    let packed = _mm_packs_epi16(halves[0], halves[1]);
+                    _mm_movemask_epi8(packed) as u32 as u64
+                }
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                let t1 = _mm256_permutevar8x32_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
-                );
-                let t2 = _mm256_permutevar8x32_epi32(
-                    b.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
-                );
-                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> (i32x8<Avx2>, i32x8<Avx2>) {
-                let lo = _mm256_unpacklo_epi32(a.into(), b.into());
-                let hi = _mm256_unpackhi_epi32(a.into(), b.into());
-                (
-                    _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token),
-                    _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> (i32x8<Avx2>, i32x8<Avx2>) {
-                let t1 = _mm256_permutevar8x32_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
-                );
-                let t2 = _mm256_permutevar8x32_epi32(
-                    b.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
-                );
-                (
-                    _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token),
-                    _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        self.xor_mask16x16(a, self.splat_mask16x16(true))
+    }
+    #[inline(always)]
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
-                a: mask32x8<Avx2>,
-                b: i32x8<Avx2>,
-                c: i32x8<Avx2>,
-            ) -> i32x8<Avx2> {
+                a: mask16x16<Avx2>,
+                b: mask16x16<Avx2>,
+                c: mask16x16<Avx2>,
+            ) -> mask16x16<Avx2> {
                 _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_min_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>, b: mask16x16<Avx2>) -> mask16x16<Avx2> {
+                _mm256_cmpeq_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_max_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 != 0
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
-        i32x16 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>) -> (i32x4<Avx2>, i32x4<Avx2>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 == 0xffffffff
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>) -> i32x8<Avx2> {
-                _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 != 0xffffffff
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>) -> u8x32<Avx2> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> bool {
+                _mm256_movemask_epi8(a.into()) as u32 == 0
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>) -> u32x8<Avx2> {
-                __m256i::from(a).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: i32x8<Avx2>) -> f32x8<Avx2> {
-                _mm256_cvtepi32_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask16x16<Avx2>) -> (mask16x8<Avx2>, mask16x8<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: u32) -> u32x8<Avx2> {
-                _mm256_set1_epi32(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx2, val: i32) -> i32x8<Avx2> {
+                _mm256_set1_epi32(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0)
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0)
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
-        u32x8 {
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_256x1(
             self,
-            self.cvt_to_bytes_u32x8(b).val.0,
-            self.cvt_to_bytes_u32x8(a).val.0,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
             SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u32x8(u8x32 {
+        self.cvt_from_bytes_i32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
         self,
-        a: u32x8<Self>,
-        b: u32x8<Self>,
-    ) -> u32x8<Self> {
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
         if SHIFT >= 4usize {
             return b;
         }
         let result = dyn_alignr_256(
             self,
-            self.cvt_to_bytes_u32x8(b).val.0,
-            self.cvt_to_bytes_u32x8(a).val.0,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
             SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u32x8(u8x32 {
+        self.cvt_from_bytes_i32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_add_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, shift: u32) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, shift: u32) -> i32x8<Avx2> {
                 _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, shift: u32) -> u32x8<Avx2> {
-                _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, shift: u32) -> i32x8<Avx2> {
+                _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
-                _mm256_srlv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
+                _mm256_srav_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
                 _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
-                let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
-                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
-                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
-                _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpgt_epi32(b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpeq_epi32(_mm256_min_epi32(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpeq_epi32(_mm256_max_epi32(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
-                let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
-                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
-                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
-                _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpgt_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
                 _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
@@ -7344,10 +8078,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
                 _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
@@ -7356,10 +8090,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 let t1 = _mm256_permutevar8x32_epi32(
                     a.into(),
                     _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
@@ -7374,10 +8108,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
                 let t1 = _mm256_permutevar8x32_epi32(
                     a.into(),
                     _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
@@ -7392,10 +8126,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> (u32x8<Avx2>, u32x8<Avx2>) {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> (i32x8<Avx2>, i32x8<Avx2>) {
                 let lo = _mm256_unpacklo_epi32(a.into(), b.into());
                 let hi = _mm256_unpackhi_epi32(a.into(), b.into());
                 (
@@ -7407,10 +8141,10 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> (u32x8<Avx2>, u32x8<Avx2>) {
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> (i32x8<Avx2>, i32x8<Avx2>) {
                 let t1 = _mm256_permutevar8x32_epi32(
                     a.into(),
                     _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
@@ -7428,52 +8162,52 @@ impl Simd for Avx2 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx2,
                 a: mask32x8<Avx2>,
-                b: u32x8<Avx2>,
-                c: u32x8<Avx2>,
-            ) -> u32x8<Avx2> {
+                b: i32x8<Avx2>,
+                c: i32x8<Avx2>,
+            ) -> i32x8<Avx2> {
                 _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
-                _mm256_min_epu32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
+                _mm256_min_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
-                _mm256_max_epu32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>, b: i32x8<Avx2>) -> i32x8<Avx2> {
+                _mm256_max_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
-        u32x16 {
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::support::Aligned512([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>) -> (u32x4<Avx2>, u32x4<Avx2>) {
+            fn kernel(token: Avx2, a: i32x8<Avx2>) -> (i32x4<Avx2>, i32x4<Avx2>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -7483,1257 +8217,3373 @@ impl Simd for Avx2 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>) -> u8x32<Avx2> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>) -> i32x8<Avx2> {
+                _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u32x8<Avx2>) -> f32x8<Avx2> {
-                let a = a.into();
-                let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000));
-                let hi = _mm256_blend_epi16::<0xAA>(
-                    _mm256_srli_epi32::<16>(a),
-                    _mm256_set1_epi32(0x53000000),
-                );
-                let fhi = _mm256_sub_ps(
-                    _mm256_castsi256_ps(hi),
-                    _mm256_set1_ps(f32::from_bits(0x53000080)),
-                );
-                let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi);
-                result.simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>) -> u8x32<Avx2> {
+                __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: bool) -> mask32x8<Avx2> {
-                let val: i32 = if val { !0 } else { 0 };
-                _mm256_set1_epi32(val).simd_into(token)
+            fn kernel(token: Avx2, a: i32x8<Avx2>) -> u32x8<Avx2> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i32x8<Avx2>) -> f32x8<Avx2> {
+                _mm256_cvtepi32_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: u32) -> u32x8<Avx2> {
+                _mm256_set1_epi32(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
-        mask32x8 {
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_u32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+        self,
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_u32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask32x8<Avx2> {
-                {
-                    let bit_lanes = _mm256_set1_epi32(bits as i32);
-                    let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
-                    _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
-                }
-                .simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_add_epi32(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, bits)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> u64 {
-                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 8usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8usize
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
+            }
         );
-        let mut lanes = self.as_array_mask32x8(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x8(lanes);
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
-        self.xor_mask32x8(a, self.splat_mask32x8(true))
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+        a ^ !0
     }
     #[inline(always)]
-    fn select_mask32x8(
-        self,
-        a: mask32x8<Self>,
-        b: mask32x8<Self>,
-        c: mask32x8<Self>,
-    ) -> mask32x8<Self> {
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask32x8<Avx2>,
-                b: mask32x8<Avx2>,
-                c: mask32x8<Avx2>,
-            ) -> mask32x8<Avx2> {
-                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, shift: u32) -> u32x8<Avx2> {
+                _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
-                _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
-                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0
+            fn kernel(token: Avx2, a: u32x8<Avx2>, shift: u32) -> u32x8<Avx2> {
+                _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
-                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_srlv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
-                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
-                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
+                let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
+                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
+                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
+                _mm256_cmpgt_epi32(b_signed, a_signed).simd_into(token)
             }
         );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
-            simd: self,
-        }
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> (mask32x4<Avx2>, mask32x4<Avx2>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpeq_epi32(_mm256_min_epu32(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: f64) -> f64x4<Avx2> {
-                _mm256_set1_pd(val).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpeq_epi32(_mm256_max_epu32(a.into(), b.into()), a.into()).simd_into(token)
             }
         );
-        kernel(self, val)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> mask32x8<Avx2> {
+                let sign_bit = _mm256_set1_epi32(0x80000000u32.cast_signed());
+                let a_signed = _mm256_xor_si256(a.into(), sign_bit);
+                let b_signed = _mm256_xor_si256(b.into(), sign_bit);
+                _mm256_cmpgt_epi32(a_signed, b_signed).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
-        crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
-        crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
-        crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0)
-    }
-    #[inline(always)]
-    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
-    }
-    #[inline(always)]
-    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let result = cross_block_alignr_256x1(
-            self,
-            self.cvt_to_bytes_f64x4(b).val.0,
-            self.cvt_to_bytes_f64x4(a).val.0,
-            SHIFT * 8usize,
-        );
-        self.cvt_from_bytes_f64x4(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_f64x4<const SHIFT: usize>(
-        self,
-        a: f64x4<Self>,
-        b: f64x4<Self>,
-    ) -> f64x4<Self> {
-        if SHIFT >= 2usize {
-            return b;
-        }
-        let result = dyn_alignr_256(
-            self,
-            self.cvt_to_bytes_f64x4(b).val.0,
-            self.cvt_to_bytes_f64x4(a).val.0,
-            SHIFT * 8usize,
-        );
-        self.cvt_from_bytes_f64x4(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                let lo = _mm256_unpacklo_epi32(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi32(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                let lo = _mm256_unpacklo_epi32(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi32(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_sqrt_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                let t1 = _mm256_permutevar8x32_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
+                );
+                let t2 = _mm256_permutevar8x32_epi32(
+                    b.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
+                );
+                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token)
             }
         );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        1.0 / a
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_add_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                let t1 = _mm256_permutevar8x32_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
+                );
+                let t2 = _mm256_permutevar8x32_epi32(
+                    b.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
+                );
+                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_sub_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> (u32x8<Avx2>, u32x8<Avx2>) {
+                let lo = _mm256_unpacklo_epi32(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi32(a.into(), b.into());
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token),
+                    _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token),
+                )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_mul_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> (u32x8<Avx2>, u32x8<Avx2>) {
+                let t1 = _mm256_permutevar8x32_epi32(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
+                );
+                let t2 = _mm256_permutevar8x32_epi32(
+                    b.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7),
+                );
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token),
+                    _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token),
+                )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_div_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: mask32x8<Avx2>,
+                b: u32x8<Avx2>,
+                c: u32x8<Avx2>,
+            ) -> u32x8<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let mask = _mm256_set1_pd(-0.0);
-                _mm256_or_pd(
-                    _mm256_and_pd(mask, b.into()),
-                    _mm256_andnot_pd(mask, a.into()),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_min_epu32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>, b: u32x8<Avx2>) -> u32x8<Avx2> {
+                _mm256_max_epu32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+        u32x16 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>) -> (u32x4<Avx2>, u32x4<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>) -> u8x32<Avx2> {
+                __m256i::from(a).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Avx2, a: u32x8<Avx2>) -> f32x8<Avx2> {
+                let a = a.into();
+                let lo = _mm256_blend_epi16::<0xAA>(a, _mm256_set1_epi32(0x4B000000));
+                let hi = _mm256_blend_epi16::<0xAA>(
+                    _mm256_srli_epi32::<16>(a),
+                    _mm256_set1_epi32(0x53000000),
+                );
+                let fhi = _mm256_sub_ps(
+                    _mm256_castsi256_ps(hi),
+                    _mm256_set1_ps(f32::from_bits(0x53000080)),
+                );
+                let result = _mm256_add_ps(_mm256_castsi256_ps(lo), fhi);
+                result.simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let lo = _mm256_unpacklo_pd(a.into(), b.into());
-                let hi = _mm256_unpackhi_pd(a.into(), b.into());
-                _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token)
+            fn kernel(token: Avx2, val: bool) -> mask32x8<Avx2> {
+                let val: i32 = if val { !0 } else { 0 };
+                _mm256_set1_epi32(val).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, val)
     }
     #[inline(always)]
-    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let lo = _mm256_unpacklo_pd(a.into(), b.into());
-                let hi = _mm256_unpackhi_pd(a.into(), b.into());
-                _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+        mask32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
-                let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
-                _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
-                let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
-                _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token)
+            fn kernel(token: Avx2, bits: u64) -> mask32x8<Avx2> {
+                {
+                    let bit_lanes = _mm256_set1_epi32(bits as i32);
+                    let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
+                    _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                }
+                .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> (f64x4<Avx2>, f64x4<Avx2>) {
-                let lo = _mm256_unpacklo_pd(a.into(), b.into());
-                let hi = _mm256_unpackhi_pd(a.into(), b.into());
-                (
-                    _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token),
-                    _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> u64 {
+                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> (f64x4<Avx2>, f64x4<Avx2>) {
-                let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
-                let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
-                (
-                    _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token),
-                    _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token),
-                )
-            }
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
         );
-        kernel(self, a, b)
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
     }
     #[inline(always)]
-    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_max_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_min_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let intermediate = _mm256_max_pd(a.into(), b.into());
-                let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into());
-                _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
-                let intermediate = _mm256_min_pd(a.into(), b.into());
-                let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into());
-                _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        self.xor_mask32x8(a, self.splat_mask32x8(true))
     }
     #[inline(always)]
-    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+    fn select_mask32x8(
+        self,
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>, c: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(
+                token: Avx2,
+                a: mask32x8<Avx2>,
+                b: mask32x8<Avx2>,
+                c: mask32x8<Avx2>,
+            ) -> mask32x8<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>, c: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>, b: mask32x8<Avx2>) -> mask32x8<Avx2> {
+                _mm256_cmpeq_epi32(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
+                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
+                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0b11111111
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
+                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 != 0b11111111
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        a - self.trunc_f64x4(a)
-    }
-    #[inline(always)]
-    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> bool {
+                _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 == 0
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask64x4<Avx2>,
-                b: f64x4<Avx2>,
-                c: f64x4<Avx2>,
-            ) -> f64x4<Avx2> {
-                _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(token)
-            }
-        );
-        kernel(self, a, b, c)
-    }
-    #[inline(always)]
-    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        mask32x16 {
             val: crate::support::Aligned512([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> (f64x2<Avx2>, f64x2<Avx2>) {
+            fn kernel(token: Avx2, a: mask32x8<Avx2>) -> (mask32x4<Avx2>, mask32x4<Avx2>) {
                 (
-                    _mm256_extractf128_pd::<0>(a.into()).simd_into(token),
-                    _mm256_extractf128_pd::<1>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
                 )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f32x8<Avx2> {
-                _mm256_castpd_ps(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, val: bool) -> mask64x4<Avx2> {
-                let val: i64 = if val { !0 } else { 0 };
-                _mm256_set1_epi64x(val).simd_into(token)
+            fn kernel(token: Avx2, val: f64) -> f64x4<Avx2> {
+                _mm256_set1_pd(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
-        mask64x4 {
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0)
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask64x4<Avx2> {
-                {
-                    let bit_lanes = _mm256_set1_epi64x(bits.cast_signed());
-                    let bit_mask = _mm256_set_epi64x(8, 4, 2, 1);
-                    _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
-                }
-                .simd_into(token)
-            }
-        );
-        kernel(self, bits)
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> u64 {
-                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64
-            }
-        );
-        kernel(self, a)
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 4usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4usize
-        );
-        let mut lanes = self.as_array_mask64x4(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x4(lanes);
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            self.cvt_to_bytes_f64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
+        self,
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            self.cvt_to_bytes_f64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
-        self.xor_mask64x4(a, self.splat_mask64x4(true))
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_sqrt_pd(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn select_mask64x4(
-        self,
-        a: mask64x4<Self>,
-        b: mask64x4<Self>,
-        c: mask64x4<Self>,
-    ) -> mask64x4<Self> {
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        1.0 / a
+    }
+    #[inline(always)]
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx2,
-                a: mask64x4<Avx2>,
-                b: mask64x4<Avx2>,
-                c: mask64x4<Avx2>,
-            ) -> mask64x4<Avx2> {
-                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_add_pd(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
-                _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_sub_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
-                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_mul_pd(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
-                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_div_pd(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
-                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let mask = _mm256_set1_pd(-0.0);
+                _mm256_or_pd(
+                    _mm256_and_pd(mask, b.into()),
+                    _mm256_andnot_pd(mask, a.into()),
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
-                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_castpd_si256(_mm256_cmp_pd::<0i32>(a.into(), b.into())).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
-        mask64x8 {
-            val: crate::support::Aligned512([a.val.0, b.val.0]),
-            simd: self,
-        }
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_castpd_si256(_mm256_cmp_pd::<17i32>(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> (mask64x2<Avx2>, mask64x2<Avx2>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_castpd_si256(_mm256_cmp_pd::<18i32>(a.into(), b.into())).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
-        let half = self.splat_f32x8(val);
-        self.combine_f32x8(half, half)
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_castpd_si256(_mm256_cmp_pd::<29i32>(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_castpd_si256(_mm256_cmp_pd::<30i32>(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let lo = _mm256_unpacklo_pd(a.into(), b.into());
+                let hi = _mm256_unpackhi_pd(a.into(), b.into());
+                _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m256; 2usize], [f32; 16usize]>(&a.val.0)
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let lo = _mm256_unpacklo_pd(a.into(), b.into());
+                let hi = _mm256_unpackhi_pd(a.into(), b.into());
+                _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m256; 2usize], [f32; 16usize]>(&a.val.0)
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
+                let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
+                _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m256; 2usize], [f32; 16usize]>(&mut a.val.0)
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into());
+                let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into());
+                _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> (f64x4<Avx2>, f64x4<Avx2>) {
+                let lo = _mm256_unpacklo_pd(a.into(), b.into());
+                let hi = _mm256_unpackhi_pd(a.into(), b.into());
+                (
+                    _mm256_permute2f128_pd::<0b0010_0000>(lo, hi).simd_into(token),
+                    _mm256_permute2f128_pd::<0b0011_0001>(lo, hi).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
-        f32x16 {
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) { // yields ([a0, a2, b0, b2], [a1, a3, b1, b3])
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> (f64x4<Avx2>, f64x4<Avx2>) {
+                let t1 = _mm256_permute4x64_pd::<0b11_01_10_00>(a.into()); // [a0, a2, a1, a3]
+                let t2 = _mm256_permute4x64_pd::<0b11_01_10_00>(b.into()); // [b0, b2, b1, b3]
+                (
+                    _mm256_permute2f128_pd::<0b0010_0000>(t1, t2).simd_into(token), // even-indexed lanes
+                    _mm256_permute2f128_pd::<0b0011_0001>(t1, t2).simd_into(token), // odd-indexed lanes
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> { // lane-wise maximum via `_mm256_max_pd`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_max_pd(a.into(), b.into()).simd_into(token) // no explicit NaN handling here; see max_precise_f64x4
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> { // lane-wise minimum via `_mm256_min_pd`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_min_pd(a.into(), b.into()).simd_into(token) // no explicit NaN handling here; see min_precise_f64x4
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> { // lane-wise maximum that falls back to `a` where `b` is NaN
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let intermediate = _mm256_max_pd(a.into(), b.into()); // hardware max; yields the second operand when either input is NaN
+                let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); // predicate 3 is the unordered compare: set where `b` is NaN
+                _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) // take `a` in the lanes flagged above
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> { // lane-wise minimum that falls back to `a` where `b` is NaN
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>) -> f64x4<Avx2> {
+                let intermediate = _mm256_min_pd(a.into(), b.into()); // hardware min; yields the second operand when either input is NaN
+                let b_is_nan = _mm256_cmp_pd::<3i32>(b.into(), b.into()); // predicate 3 is the unordered compare: set where `b` is NaN
+                _mm256_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(token) // take `a` in the lanes flagged above
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> { // fused `a * b + c` per lane
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>, c: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token) // FMA instruction: single rounding step
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> { // fused `a * b - c` per lane
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>, b: f64x4<Avx2>, c: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token) // FMA instruction: single rounding step
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> { // rounds each lane toward negative infinity
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> { // rounds each lane toward positive infinity
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> { // rounds each lane to the nearest integer, ties to even
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> { // fractional part per lane
+        a - self.trunc_f64x4(a) // value minus its truncation, so the result keeps the sign of `a`
+    }
+    #[inline(always)]
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> { // rounds each lane toward zero
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f64x4<Avx2> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()) // NO_EXC suppresses floating-point exceptions
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> { // per lane: `b` where mask `a` is set, otherwise `c`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask64x4<Avx2>,
+                b: f64x4<Avx2>,
+                c: f64x4<Avx2>,
+            ) -> f64x4<Avx2> {
+                _mm256_blendv_pd(c.into(), b.into(), _mm256_castsi256_pd(a.into())).simd_into(token) // blendv keys on each lane's sign bit; the cast only reinterprets the mask bits
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> { // concatenates two 4-lane vectors into one 8-lane vector
+        f64x8 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]), // stored as a pair of 256-bit registers, `a` first
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) { // splits into (low, high) 128-bit halves
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> (f64x2<Avx2>, f64x2<Avx2>) {
+                (
+                    _mm256_extractf128_pd::<0>(a.into()).simd_into(token), // lanes 0..2
+                    _mm256_extractf128_pd::<1>(a.into()).simd_into(token), // lanes 2..4
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> { // views the same 256 bits as eight f32 lanes
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f64x4<Avx2>) -> f32x8<Avx2> {
+                _mm256_castpd_ps(a.into()).simd_into(token) // bit cast only, no numeric conversion
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_i64x4(self, val: i64) -> i64x4<Self> { // broadcasts `val` to all four lanes
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: i64) -> i64x4<Avx2> {
+                _mm256_set1_epi64x(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self> { // builds a vector from an owned array
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(&val), // presumably a size-checked bitwise copy — see crate::transmute
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self> { // builds a vector from a borrowed array
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(val), // presumably a size-checked bitwise copy — see crate::transmute
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize] { // copies the lanes out as a plain array
+        crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0) // `a.val.0` is the raw `__m256i` register
+    }
+    #[inline(always)]
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize] { // borrows the vector storage as an array, no copy
+        crate::transmute::checked_cast_ref::<__m256i, [i64; 4usize]>(&a.val.0) // reference cast of the raw `__m256i` register
+    }
+    #[inline(always)]
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize] { // mutably borrows the vector storage as an array
+        crate::transmute::checked_cast_mut::<__m256i, [i64; 4usize]>(&mut a.val.0) // writes through the result modify `a` in place
+    }
+    #[inline(always)]
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> () { // writes the four lanes into `dest`
+        crate::transmute::checked_transmute_store(a.val.0, dest); // presumably a size-checked bitwise store — see crate::transmute
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self> { // rewraps the storage of a 32-byte vector as four i64 lanes
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self> { // rewraps the storage of four i64 lanes as a 32-byte vector
+        u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // slides by SHIFT lanes across the `a`/`b` pair; exact lane order is defined by the alignr helper
+        if SHIFT >= 4usize { // a shift of the full vector width (4 lanes) or more returns `b` unchanged
             return b;
         }
-        let result = cross_block_alignr_256x2(
+        let result = cross_block_alignr_256x1( // helper is defined elsewhere in this file; note that `b` is passed before `a`
             self,
-            self.cvt_to_bytes_f32x16(b).val.0,
-            self.cvt_to_bytes_f32x16(a).val.0,
-            SHIFT * 4usize,
+            self.cvt_to_bytes_i64x4(b).val.0,
+            self.cvt_to_bytes_i64x4(a).val.0,
+            SHIFT * 8usize, // 8 bytes per i64 lane; presumably the helper takes a byte offset — confirm
         );
-        self.cvt_from_bytes_f32x16(u8x64 {
-            val: crate::support::Aligned512(result),
+        self.cvt_from_bytes_i64x4(u8x32 { // rewrap the shifted bytes as i64 lanes
+            val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+    fn slide_within_blocks_i64x4<const SHIFT: usize>( // variant of slide_i64x4 that uses `dyn_alignr_256` instead of the cross-block helper
         self,
-        a: f32x16<Self>,
-        b: f32x16<Self>,
-    ) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
-        )
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self> {
+        if SHIFT >= 2usize { // 2 is presumably the i64 lane count of one 128-bit block; at or beyond it `b` is returned as-is
+            return b;
+        }
+        let result = dyn_alignr_256( // helper is defined elsewhere in this file; note that `b` is passed before `a`
+            self,
+            self.cvt_to_bytes_i64x4(b).val.0,
+            self.cvt_to_bytes_i64x4(a).val.0,
+            SHIFT * 8usize, // 8 bytes per i64 lane; presumably the helper takes a byte offset — confirm
+        );
+        self.cvt_from_bytes_i64x4(u8x32 { // rewrap the shifted bytes as i64 lanes
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // lane-wise addition
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_add_epi64(a.into(), b.into()).simd_into(token) // wraps on overflow
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // lane-wise subtraction `a - b`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_sub_epi64(a.into(), b.into()).simd_into(token) // wraps on overflow
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // lane-wise wrapping multiply, done in scalar code rather than with an intrinsic
+        let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+        let b: [i64; 4usize] = b.into();
+        let result: [i64; 4usize] = [
+            a[0usize].wrapping_mul(b[0usize]),
+            a[1usize].wrapping_mul(b[1usize]),
+            a[2usize].wrapping_mul(b[2usize]),
+            a[3usize].wrapping_mul(b[3usize]),
+        ];
+        result.simd_into(self) // repack the products into a vector
     }
     #[inline(always)]
-    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // bitwise AND
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token) // operates on all 256 bits, lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // bitwise OR
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token) // operates on all 256 bits, lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // bitwise XOR
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token) // operates on all 256 bits, lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self> { // bitwise NOT
+        a ^ !0 // XOR with an all-ones scalar flips every bit
+    }
+    #[inline(always)]
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> { // shifts every lane left by the same runtime count
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, shift: u32) -> i64x4<Avx2> {
+                _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) // count goes in the low lane of an xmm register; counts above 63 produce zero
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // shifts each lane of `a` left by the matching lane of `b`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_sllv_epi64(a.into(), b.into()).simd_into(token) // per-lane variable shift; counts above 63 produce zero
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> { // arithmetic (sign-preserving) right shift of every lane, done in scalar code
+        let a: [i64; 4usize] = a.into(); // move the lanes into a plain array
+        let result: [i64; 4usize] = [ // NOTE(review): scalar `>>` panics for shift >= 64 when overflow checks are on, while shl_i64x4 yields zero — confirm this asymmetry is intended
+            core::ops::Shr::shr(a[0usize], shift),
+            core::ops::Shr::shr(a[1usize], shift),
+            core::ops::Shr::shr(a[2usize], shift),
+            core::ops::Shr::shr(a[3usize], shift),
+        ];
+        result.simd_into(self) // repack the shifted lanes into a vector
+    }
+    #[inline(always)]
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // arithmetic right shift of each lane of `a` by the matching lane of `b`, done in scalar code
+        let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+        let b: [i64; 4usize] = b.into();
+        let result: [i64; 4usize] = [ // NOTE(review): a negative or >= 64 count in `b` panics when overflow checks are on, unlike shlv_i64x4 — confirm intended
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self) // repack the shifted lanes into a vector
+    }
+    #[inline(always)]
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> { // lane-wise equality test
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token) // all-ones lane where equal, zero lane otherwise
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> { // lane-wise signed `a < b`, evaluated in scalar code
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> mask64x4<Avx2> { // NOTE(review): AVX2 has `_mm256_cmpgt_epi64`; with swapped operands it looks like a direct replacement — consider in the generator
+                let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+                let b: [i64; 4usize] = b.into();
+                let true_lane: i64 = !0; // all bits set marks a true lane
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] < b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] < b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] < b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] < b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token) // repack the lane flags as a mask vector
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> { // lane-wise signed `a <= b`, evaluated in scalar code
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> mask64x4<Avx2> { // NOTE(review): AVX2 has `_mm256_cmpgt_epi64`; its inverted result looks like a replacement — consider in the generator
+                let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+                let b: [i64; 4usize] = b.into();
+                let true_lane: i64 = !0; // all bits set marks a true lane
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] <= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] <= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] <= b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] <= b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token) // repack the lane flags as a mask vector
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> { // lane-wise signed `a >= b`, evaluated in scalar code
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> mask64x4<Avx2> { // NOTE(review): AVX2 has `_mm256_cmpgt_epi64`; swapped operands plus inversion looks like a replacement — consider in the generator
+                let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+                let b: [i64; 4usize] = b.into();
+                let true_lane: i64 = !0; // all bits set marks a true lane
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] >= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] >= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] >= b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] >= b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token) // repack the lane flags as a mask vector
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> { // lane-wise signed `a > b`, evaluated in scalar code
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> mask64x4<Avx2> { // NOTE(review): AVX2 has `_mm256_cmpgt_epi64`, which looks like a direct replacement — consider in the generator
+                let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+                let b: [i64; 4usize] = b.into();
+                let true_lane: i64 = !0; // all bits set marks a true lane
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] > b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] > b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] > b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] > b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token) // repack the lane flags as a mask vector
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // yields [a0, b0, a1, b1]
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                let lo = _mm256_unpacklo_epi64(a.into(), b.into()); // [a0, b0, a2, b2] (unpack works per 128-bit half)
+                let hi = _mm256_unpackhi_epi64(a.into(), b.into()); // [a1, b1, a3, b3]
+                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token) // low halves of lo and hi
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // yields [a2, b2, a3, b3]
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                let lo = _mm256_unpacklo_epi64(a.into(), b.into()); // [a0, b0, a2, b2] (unpack works per 128-bit half)
+                let hi = _mm256_unpackhi_epi64(a.into(), b.into()); // [a1, b1, a3, b3]
+                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token) // high halves of lo and hi
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // yields [a0, a2, b0, b2]
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); // reorder lanes to [0, 2, 1, 3]
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); // same reorder for `b`
+                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token) // low 128-bit halves of t1 and t2
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // yields [a1, a3, b1, b3]
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> i64x4<Avx2> {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); // reorder lanes to [0, 2, 1, 3]
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); // same reorder for `b`
+                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token) // high 128-bit halves of t1 and t2
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) { // yields ([a0, b0, a1, b1], [a2, b2, a3, b3])
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> (i64x4<Avx2>, i64x4<Avx2>) {
+                let lo = _mm256_unpacklo_epi64(a.into(), b.into()); // [a0, b0, a2, b2] (unpack works per 128-bit half)
+                let hi = _mm256_unpackhi_epi64(a.into(), b.into()); // [a1, b1, a3, b3]
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token), // low halves of lo and hi
+                    _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token), // high halves of lo and hi
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) { // yields ([a0, a2, b0, b2], [a1, a3, b1, b3])
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>, b: i64x4<Avx2>) -> (i64x4<Avx2>, i64x4<Avx2>) {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into()); // [a0, a2, a1, a3]
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into()); // [b0, b2, b1, b3]
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token), // even-indexed lanes
+                    _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token), // odd-indexed lanes
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self> { // per lane: `b` where mask `a` is set, otherwise `c`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask64x4<Avx2>,
+                b: i64x4<Avx2>,
+                c: i64x4<Avx2>,
+            ) -> i64x4<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token) // byte-granular blend; assumes each mask lane is all-ones or all-zeros — confirm
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // lane-wise signed minimum, evaluated in scalar code
+        let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+        let b: [i64; 4usize] = b.into();
+        let result: [i64; 4usize] = [
+            a[0usize].min(b[0usize]),
+            a[1usize].min(b[1usize]),
+            a[2usize].min(b[2usize]),
+            a[3usize].min(b[3usize]),
+        ];
+        result.simd_into(self) // repack the results into a vector
+    }
+    #[inline(always)]
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> { // lane-wise signed maximum, evaluated in scalar code
+        let a: [i64; 4usize] = a.into(); // move the lanes into plain arrays
+        let b: [i64; 4usize] = b.into();
+        let result: [i64; 4usize] = [
+            a[0usize].max(b[0usize]),
+            a[1usize].max(b[1usize]),
+            a[2usize].max(b[2usize]),
+            a[3usize].max(b[3usize]),
+        ];
+        result.simd_into(self) // repack the results into a vector
+    }
+    #[inline(always)]
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self> { // concatenates two 4-lane vectors into one 8-lane vector
+        i64x8 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]), // stored as a pair of 256-bit registers, `a` first
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>) { // splits into (low, high) 128-bit halves
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>) -> (i64x2<Avx2>, i64x2<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token), // lanes 0..2
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token), // lanes 2..4
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self> { // lane-wise negation
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>) -> i64x4<Avx2> {
+                _mm256_sub_epi64(_mm256_setzero_si256(), a.into()).simd_into(token) // computed as `0 - a`, so i64::MIN wraps to itself
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self> { // retypes the vector as 32 u8 lanes
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>) -> u8x32<Avx2> {
+                __m256i::from(a).simd_into(token) // goes through the raw `__m256i` register; no intrinsic is invoked
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self> { // retypes the vector as eight u32 lanes
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: i64x4<Avx2>) -> u32x8<Avx2> {
+                __m256i::from(a).simd_into(token) // goes through the raw `__m256i` register; no intrinsic is invoked
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_u64x4(self, val: u64) -> u64x4<Self> { // broadcasts `val` to all four lanes
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: u64) -> u64x4<Avx2> {
+                _mm256_set1_epi64x(val.cast_signed()).simd_into(token) // the intrinsic takes i64; cast_signed keeps the bit pattern
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self> { // builds a vector from an owned array
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(&val), // presumably a size-checked bitwise copy — see crate::transmute
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self> { // builds a vector from a borrowed array
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(val), // presumably a size-checked bitwise copy — see crate::transmute
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize] { // copies the lanes out as a plain array
+        crate::transmute::checked_transmute_copy::<__m256i, [u64; 4usize]>(&a.val.0) // `a.val.0` is the raw `__m256i` register
+    }
+    #[inline(always)]
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize] { // borrows the vector storage as an array, no copy
+        crate::transmute::checked_cast_ref::<__m256i, [u64; 4usize]>(&a.val.0) // reference cast of the raw `__m256i` register
+    }
+    #[inline(always)]
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize] { // mutably borrows the vector storage as an array
+        crate::transmute::checked_cast_mut::<__m256i, [u64; 4usize]>(&mut a.val.0) // writes through the result modify `a` in place
+    }
+    #[inline(always)]
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> () { // writes the four lanes into `dest`
+        crate::transmute::checked_transmute_store(a.val.0, dest); // presumably a size-checked bitwise store — see crate::transmute
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self> { // rewraps the storage of a 32-byte vector as four u64 lanes
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // copies the whole aligned wrapper, not just the inner register
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self> { // rewraps the storage of four u64 lanes as a 32-byte vector
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // copies the whole aligned wrapper, not just the inner register
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> { // slides by SHIFT lanes across the `a`/`b` pair; exact lane order is defined by the alignr helper
+        if SHIFT >= 4usize { // a shift of the full vector width (4 lanes) or more returns `b` unchanged
+            return b;
+        }
+        let result = cross_block_alignr_256x1( // helper is defined elsewhere in this file; note that `b` is passed before `a`
+            self,
+            self.cvt_to_bytes_u64x4(b).val.0,
+            self.cvt_to_bytes_u64x4(a).val.0,
+            SHIFT * 8usize, // 8 bytes per u64 lane; presumably the helper takes a byte offset — confirm
+        );
+        self.cvt_from_bytes_u64x4(u8x32 { // rewrap the shifted bytes as u64 lanes
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u64x4<const SHIFT: usize>( // variant of slide_u64x4 that uses `dyn_alignr_256` instead of the cross-block helper
+        self,
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self> {
+        if SHIFT >= 2usize { // 2 is presumably the u64 lane count of one 128-bit block; at or beyond it `b` is returned as-is
+            return b;
+        }
+        let result = dyn_alignr_256( // helper is defined elsewhere in this file; note that `b` is passed before `a`
+            self,
+            self.cvt_to_bytes_u64x4(b).val.0,
+            self.cvt_to_bytes_u64x4(a).val.0,
+            SHIFT * 8usize, // 8 bytes per u64 lane; presumably the helper takes a byte offset — confirm
+        );
+        self.cvt_from_bytes_u64x4(u8x32 { // rewrap the shifted bytes as u64 lanes
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> { // lane-wise addition
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_add_epi64(a.into(), b.into()).simd_into(token) // wraps on overflow; same instruction as the signed variant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> { // lane-wise subtraction `a - b`
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_sub_epi64(a.into(), b.into()).simd_into(token) // wraps on underflow; same instruction as the signed variant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> { // lane-wise wrapping multiply, done in scalar code rather than with an intrinsic
+        let a: [u64; 4usize] = a.into(); // move the lanes into plain arrays
+        let b: [u64; 4usize] = b.into();
+        let result: [u64; 4usize] = [
+            a[0usize].wrapping_mul(b[0usize]),
+            a[1usize].wrapping_mul(b[1usize]),
+            a[2usize].wrapping_mul(b[2usize]),
+            a[3usize].wrapping_mul(b[3usize]),
+        ];
+        result.simd_into(self) // repack the products into a vector
+    }
+    #[inline(always)]
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> { // bitwise AND
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token) // operates on all 256 bits, lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, shift: u32) -> u64x4<Avx2> {
+                _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_sllv_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, shift: u32) -> u64x4<Avx2> {
+                _mm256_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                _mm256_srlv_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> mask64x4<Avx2> {
+                let a: [u64; 4usize] = a.into();
+                let b: [u64; 4usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] < b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] < b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] < b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] < b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> mask64x4<Avx2> {
+                let a: [u64; 4usize] = a.into();
+                let b: [u64; 4usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] <= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] <= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] <= b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] <= b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> mask64x4<Avx2> {
+                let a: [u64; 4usize] = a.into();
+                let b: [u64; 4usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] >= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] >= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] >= b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] >= b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> mask64x4<Avx2> {
+                let a: [u64; 4usize] = a.into();
+                let b: [u64; 4usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 4usize] = [
+                    if a[0usize] > b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] > b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[2usize] > b[2usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[3usize] > b[3usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                let lo = _mm256_unpacklo_epi64(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi64(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                let lo = _mm256_unpacklo_epi64(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi64(a.into(), b.into());
+                _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into());
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into());
+                _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> u64x4<Avx2> {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into());
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into());
+                _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> (u64x4<Avx2>, u64x4<Avx2>) {
+                let lo = _mm256_unpacklo_epi64(a.into(), b.into());
+                let hi = _mm256_unpackhi_epi64(a.into(), b.into());
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(lo, hi).simd_into(token),
+                    _mm256_permute2x128_si256::<0b0011_0001>(lo, hi).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>, b: u64x4<Avx2>) -> (u64x4<Avx2>, u64x4<Avx2>) {
+                let t1 = _mm256_permute4x64_epi64::<0b11_01_10_00>(a.into());
+                let t2 = _mm256_permute4x64_epi64::<0b11_01_10_00>(b.into());
+                (
+                    _mm256_permute2x128_si256::<0b0010_0000>(t1, t2).simd_into(token),
+                    _mm256_permute2x128_si256::<0b0011_0001>(t1, t2).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask64x4<Avx2>,
+                b: u64x4<Avx2>,
+                c: u64x4<Avx2>,
+            ) -> u64x4<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let a: [u64; 4usize] = a.into();
+        let b: [u64; 4usize] = b.into();
+        let result: [u64; 4usize] = [
+            a[0usize].min(b[0usize]),
+            a[1usize].min(b[1usize]),
+            a[2usize].min(b[2usize]),
+            a[3usize].min(b[3usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let a: [u64; 4usize] = a.into();
+        let b: [u64; 4usize] = b.into();
+        let result: [u64; 4usize] = [
+            a[0usize].max(b[0usize]),
+            a[1usize].max(b[1usize]),
+            a[2usize].max(b[2usize]),
+            a[3usize].max(b[3usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self> {
+        u64x8 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>) -> (u64x2<Avx2>, u64x2<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>) -> u8x32<Avx2> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x4<Avx2>) -> u32x8<Avx2> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, val: bool) -> mask64x4<Avx2> {
+                let val: i64 = if val { !0 } else { 0 };
+                _mm256_set1_epi64x(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, bits: u64) -> mask64x4<Avx2> {
+                {
+                    let bit_lanes = _mm256_set1_epi64x(bits.cast_signed());
+                    let bit_mask = _mm256_set_epi64x(8, 4, 2, 1);
+                    _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                }
+                .simd_into(token)
+            }
+        );
+        kernel(self, bits)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> u64 {
+                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        self.xor_mask64x4(a, self.splat_mask64x4(true))
+    }
+    #[inline(always)]
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx2,
+                a: mask64x4<Avx2>,
+                b: mask64x4<Avx2>,
+                c: mask64x4<Avx2>,
+            ) -> mask64x4<Avx2> {
+                _mm256_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>, b: mask64x4<Avx2>) -> mask64x4<Avx2> {
+                _mm256_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
+                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
+                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0b1111
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
+                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 != 0b1111
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> bool {
+                _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 == 0
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: crate::support::Aligned512([a.val.0, b.val.0]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: mask64x4<Avx2>) -> (mask64x2<Avx2>, mask64x2<Avx2>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+        let half = self.splat_f32x8(val);
+        self.combine_f32x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m256; 2usize], [f32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        crate::transmute::checked_cast_ref::<[__m256; 2usize], [f32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        crate::transmute::checked_cast_mut::<[__m256; 2usize], [f32; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_f32x16(b).val.0,
+            self.cvt_to_bytes_f32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+        self,
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
+    }
+    #[inline(always)]
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
+    }
+    #[inline(always)]
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
+    }
+    #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.approximate_recip_f32x8(a0),
+            self.approximate_recip_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, _) = self.split_f32x16(a);
+        let (b0, _) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (_, a1) = self.split_f32x16(a);
+        let (_, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_lo = self.zip_low_f32x8(a0, b0);
+        let lo_hi = self.zip_high_f32x8(a0, b0);
+        let hi_lo = self.zip_low_f32x8(a1, b1);
+        let hi_hi = self.zip_high_f32x8(a1, b1);
+        (
+            self.combine_f32x8(lo_lo, lo_hi),
+            self.combine_f32x8(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_even = self.unzip_low_f32x8(a0, a1);
+        let lo_odd = self.unzip_high_f32x8(a0, a1);
+        let hi_even = self.unzip_low_f32x8(b0, b1);
+        let hi_odd = self.unzip_high_f32x8(b0, b1);
+        (
+            self.combine_f32x8(lo_even, hi_even),
+            self.combine_f32x8(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.max_precise_f32x8(a0, b0),
+            self.max_precise_f32x8(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
         self.combine_f32x8(
-            self.approximate_recip_f32x8(a0),
-            self.approximate_recip_f32x8(a1),
+            self.min_precise_f32x8(a0, b0),
+            self.min_precise_f32x8(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_add_f32x8(a0, b0, c0),
+            self.mul_add_f32x8(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_sub_f32x8(a0, b0, c0),
+            self.mul_sub_f32x8(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+    }
+    #[inline(always)]
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> { // Runs `ceil_f32x8` on each f32x8 half of `a`.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1)) // Recombine in the original half order.
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> { // Runs `round_ties_even_f32x8` on each half of `a`.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8( // Recombine in the original half order.
+            self.round_ties_even_f32x8(a0),
+            self.round_ties_even_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> { // Runs `fract_f32x8` on each f32x8 half of `a`.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) // Recombine in the original half order.
+    }
+    #[inline(always)]
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> { // Runs `trunc_f32x8` on each f32x8 half of `a`.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) // Recombine in the original half order.
+    }
+    #[inline(always)]
+    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> { // Delegates to `select_f32x8` per half.
+        let (a0, a1) = self.split_mask32x16(a); // The mask is split the same way as the data vectors.
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) { // Returns the two stored 256-bit elements as separate f32x8 vectors.
+        (
+            f32x8 {
+                val: crate::support::Aligned256(a.val.0[0]), // first element of the backing array
+                simd: self,
+            },
+            f32x8 {
+                val: crate::support::Aligned256(a.val.0[1]), // second element of the backing array
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> { // Delegates to `reinterpret_f64_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f64x4( // Two f64x4 results form the f64x8 output.
+            self.reinterpret_f64_f32x8(a0),
+            self.reinterpret_f64_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> { // Delegates to `reinterpret_i32_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8( // Two i32x8 results form the i32x16 output.
+            self.reinterpret_i32_f32x8(a0),
+            self.reinterpret_i32_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> { // Loads 16 floats as four 4-lane rows and returns their 4x4 transpose.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, src: &[f32; 16usize]) -> f32x16<Avx2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else { // 16 elements divide evenly by 4, so the remainder is always empty.
+                    unreachable!()
+                };
+                let v0: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); // row 0: src[0..4]
+                let v1: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); // row 1: src[4..8]
+                let v2: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); // row 2: src[8..12]
+                let v3: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); // row 3: src[12..16]
+                let tmp0 = _mm_unpacklo_ps(v0, v1); // [v0[0], v1[0], v0[1], v1[1]]
+                let tmp1 = _mm_unpackhi_ps(v0, v1); // [v0[2], v1[2], v0[3], v1[3]]
+                let tmp2 = _mm_unpacklo_ps(v2, v3); // [v2[0], v3[0], v2[1], v3[1]]
+                let tmp3 = _mm_unpackhi_ps(v2, v3); // [v2[2], v3[2], v2[3], v3[3]]
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); // lane 0 of every row
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); // lane 1 of every row
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); // lane 2 of every row
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); // lane 3 of every row
+                token.combine_f32x8( // Result order is out0, out1, out2, out3.
+                    token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () { // Transposes the four 4-lane quarters of `a` (4x4) and writes them to `dest`.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x16<Avx2>, dest: &mut [f32; 16usize]) -> () {
+                let (v01, v23) = token.split_f32x16(a); // two f32x8 halves
+                let (v0, v1) = token.split_f32x8(v01); // quarters 0 and 1
+                let (v2, v3) = token.split_f32x8(v23); // quarters 2 and 3
+                let v0 = v0.into(); // Shadowed as raw vectors; the unpack intrinsics below take `__m128`.
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_ps(v0, v1); // [v0[0], v1[0], v0[1], v1[1]]
+                let tmp1 = _mm_unpackhi_ps(v0, v1); // [v0[2], v1[2], v0[3], v1[3]]
+                let tmp2 = _mm_unpacklo_ps(v2, v3); // [v2[0], v3[0], v2[1], v3[1]]
+                let tmp3 = _mm_unpackhi_ps(v2, v3); // [v2[2], v3[2], v2[3], v3[3]]
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); // lane 0 of every quarter
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); // lane 1 of every quarter
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); // lane 2 of every quarter
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); // lane 3 of every quarter
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else { // 16 elements divide evenly by 4, so the remainder is always empty.
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( // dest[0..4]
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( // dest[4..8]
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( // dest[8..12]
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( // dest[12..16]
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest);
+    }
+    #[inline(always)]
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> { // Delegates to `reinterpret_u8_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) // Two u8x32 results form the u8x64 output.
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> { // Delegates to `reinterpret_u32_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8( // Two u32x8 results form the u32x16 output.
+            self.reinterpret_u32_f32x8(a0),
+            self.reinterpret_u32_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> { // Delegates to `cvt_u32_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) // Two u32x8 results form the u32x16 output.
+    }
+    #[inline(always)]
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> { // Delegates to `cvt_u32_precise_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8( // Two u32x8 results form the u32x16 output.
+            self.cvt_u32_precise_f32x8(a0),
+            self.cvt_u32_precise_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> { // Delegates to `cvt_i32_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) // Two i32x8 results form the i32x16 output.
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> { // Delegates to `cvt_i32_precise_f32x8` per half.
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8( // Two i32x8 results form the i32x16 output.
+            self.cvt_i32_precise_f32x8(a0),
+            self.cvt_i32_precise_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> { // Builds an i8x64 from one `splat_i8x32(val)` used for both halves.
+        let half = self.splat_i8x32(val); // Computed once and reused.
+        self.combine_i8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> { // Wraps a by-value 64-element array as an i8x64.
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&val), // Array contents are copied into the vector's storage type.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> { // Same as the by-value load, but reads through a reference.
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(val), // Array contents are copied into the vector's storage type.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] { // Returns the vector's contents as an owned array.
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) // Storage is two `__m256i` values.
+    }
+    #[inline(always)]
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] { // Borrows the vector's storage as a 64-element array.
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0) // Reference cast over the two `__m256i` values.
+    }
+    #[inline(always)]
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] { // Mutably borrows the vector's storage as a 64-element array.
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i8; 64usize]>(&mut a.val.0) // Mutable reference cast over the two `__m256i` values.
+    }
+    #[inline(always)]
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () { // Writes the vector's contents into `dest`.
+        crate::transmute::checked_transmute_store(a.val.0, dest); // Source is the raw storage `a.val.0`.
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> { // Rewraps a u8x64's storage as an i8x64.
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // Storage is copied as-is; no per-lane operation is applied here.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> { // Rewraps an i8x64's storage as a u8x64.
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // Storage is copied as-is; no per-lane operation is applied here.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Slide over `a` and `b` by `SHIFT` lanes via the byte-level helper.
+        if SHIFT >= 64usize { // A shift of the full width or more yields `b` unchanged.
+            return b;
+        }
+        let result = cross_block_alignr_256x2( // NOTE(review): helper defined elsewhere; presumably an alignr across both 256-bit parts - confirm.
+            self,
+            self.cvt_to_bytes_i8x64(b).val.0, // `b` is passed before `a`.
+            self.cvt_to_bytes_i8x64(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x64(u8x64 { // Wrap the helper's result as bytes, then convert back to i8x64.
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i8x64<const SHIFT: usize>( // Delegates to `slide_within_blocks_i8x32` per half with the same `SHIFT`.
+        self,
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32( // Halves are processed independently; nothing crosses between them here.
+            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `add_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `sub_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `mul_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `and_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `or_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `xor_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> { // Delegates to `not_i8x32` on each half of `a`.
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
+    }
+    #[inline(always)]
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> { // Delegates to `shl_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift)) // The same scalar `shift` is passed to both halves.
+    }
+    #[inline(always)]
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `shlv_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> { // Delegates to `shr_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift)) // The same scalar `shift` is passed to both halves.
+    }
+    #[inline(always)]
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `shrv_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_eq_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
+    }
+    #[inline(always)]
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_lt_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
+    }
+    #[inline(always)]
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_le_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
+    }
+    #[inline(always)]
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_ge_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
+    }
+    #[inline(always)]
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_gt_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
+    }
+    #[inline(always)]
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Zips only the first halves of `a` and `b`.
+        let (a0, _) = self.split_i8x64(a); // Second half of `a` is unused.
+        let (b0, _) = self.split_i8x64(b); // Second half of `b` is unused.
+        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) // Low zip then high zip of the same pair.
+    }
+    #[inline(always)]
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Zips only the second halves of `a` and `b`.
+        let (_, a1) = self.split_i8x64(a); // First half of `a` is unused.
+        let (_, b1) = self.split_i8x64(b); // First half of `b` is unused.
+        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) // Low zip then high zip of the same pair.
+    }
+    #[inline(always)]
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // `unzip_low_i8x32` of `a`'s halves, then of `b`'s halves.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) // Each call pairs the two halves of one input.
+    }
+    #[inline(always)]
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // `unzip_high_i8x32` of `a`'s halves, then of `b`'s halves.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) // Each call pairs the two halves of one input.
+    }
+    #[inline(always)]
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) { // Returns (zip of first halves, zip of second halves).
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_lo = self.zip_low_i8x32(a0, b0);
+        let lo_hi = self.zip_high_i8x32(a0, b0);
+        let hi_lo = self.zip_low_i8x32(a1, b1);
+        let hi_hi = self.zip_high_i8x32(a1, b1);
+        (
+            self.combine_i8x32(lo_lo, lo_hi), // built from a0/b0 only
+            self.combine_i8x32(hi_lo, hi_hi), // built from a1/b1 only
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) { // Returns (unzip_low results, unzip_high results).
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_even = self.unzip_low_i8x32(a0, a1);
+        let lo_odd = self.unzip_high_i8x32(a0, a1);
+        let hi_even = self.unzip_low_i8x32(b0, b1);
+        let hi_odd = self.unzip_high_i8x32(b0, b1);
+        (
+            self.combine_i8x32(lo_even, hi_even), // `a`'s contribution first, then `b`'s
+            self.combine_i8x32(lo_odd, hi_odd), // `a`'s contribution first, then `b`'s
+        )
+    }
+    #[inline(always)]
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> { // Delegates to `select_i8x32` per half.
+        let (a0, a1) = self.split_mask8x64(a); // The mask is split the same way as the data vectors.
+        let (b0, b1) = self.split_i8x64(b);
+        let (c0, c1) = self.split_i8x64(c);
+        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `min_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> { // Delegates to `max_i8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) { // Returns the two stored 256-bit elements as separate i8x32 vectors.
+        (
+            i8x32 {
+                val: crate::support::Aligned256(a.val.0[0]), // first element of the backing array
+                simd: self,
+            },
+            i8x32 {
+                val: crate::support::Aligned256(a.val.0[1]), // second element of the backing array
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> { // Delegates to `neg_i8x32` on each half of `a`.
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> { // Delegates to `reinterpret_u8_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) // Two u8x32 results form the u8x64 output.
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> { // Delegates to `reinterpret_u32_i8x32` per half.
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u32x8( // Two u32x8 results form the u32x16 output.
+            self.reinterpret_u32_i8x32(a0),
+            self.reinterpret_u32_i8x32(a1),
         )
     }
     #[inline(always)]
-    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> { // Builds a u8x64 from one `splat_u8x32(val)` used for both halves.
+        let half = self.splat_u8x32(val); // Computed once and reused.
+        self.combine_u8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> { // Wraps a by-value 64-element array as a u8x64.
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&val), // Array contents are copied into the vector's storage type.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> { // Same as the by-value load, but reads through a reference.
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(val), // Array contents are copied into the vector's storage type.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] { // Returns the vector's contents as an owned array.
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0) // Storage is two `__m256i` values.
+    }
+    #[inline(always)]
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] { // Borrows the vector's storage as a 64-element array.
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0) // Reference cast over the two `__m256i` values.
+    }
+    #[inline(always)]
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] { // Mutably borrows the vector's storage as a 64-element array.
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u8; 64usize]>(&mut a.val.0) // Mutable reference cast over the two `__m256i` values.
+    }
+    #[inline(always)]
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () { // Writes the vector's contents into `dest`.
+        crate::transmute::checked_transmute_store(a.val.0, dest); // Source is the raw storage `a.val.0`.
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> { // Input and output are both u8x64; the storage is copied unchanged.
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // Same storage type on both sides.
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> { // Input and output are both u8x64; the storage is copied unchanged.
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val), // Same storage type on both sides.
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Slide over `a` and `b` by `SHIFT` lanes via the byte-level helper.
+        if SHIFT >= 64usize { // A shift of the full width or more yields `b` unchanged.
+            return b;
+        }
+        let result = cross_block_alignr_256x2( // NOTE(review): helper defined elsewhere; presumably an alignr across both 256-bit parts - confirm.
+            self,
+            self.cvt_to_bytes_u8x64(b).val.0, // `b` is passed before `a`.
+            self.cvt_to_bytes_u8x64(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x64(u8x64 { // Wrap the helper's result as the u8x64 storage.
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
+    fn slide_within_blocks_u8x64<const SHIFT: usize>( // Delegates to `slide_within_blocks_u8x32` per half with the same `SHIFT`.
+        self,
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32( // Halves are processed independently; nothing crosses between them here.
+            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `add_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `sub_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `mul_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `and_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `or_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `xor_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> { // Delegates to `not_u8x32` on each half of `a`.
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
     }
     #[inline(always)]
-    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, _) = self.split_f32x16(a);
-        let (b0, _) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> { // Delegates to `shl_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift)) // The same scalar `shift` is passed to both halves.
     }
     #[inline(always)]
-    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (_, a1) = self.split_f32x16(a);
-        let (_, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `shlv_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> { // Delegates to `shr_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift)) // The same scalar `shift` is passed to both halves.
     }
     #[inline(always)]
-    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Delegates to `shrv_u8x32` on matching halves of `a`/`b`.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_lo = self.zip_low_f32x8(a0, b0);
-        let lo_hi = self.zip_high_f32x8(a0, b0);
-        let hi_lo = self.zip_low_f32x8(a1, b1);
-        let hi_hi = self.zip_high_f32x8(a1, b1);
-        (
-            self.combine_f32x8(lo_lo, lo_hi),
-            self.combine_f32x8(hi_lo, hi_hi),
-        )
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_eq_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
     }
     #[inline(always)]
-    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_even = self.unzip_low_f32x8(a0, a1);
-        let lo_odd = self.unzip_high_f32x8(a0, a1);
-        let hi_even = self.unzip_low_f32x8(b0, b1);
-        let hi_odd = self.unzip_high_f32x8(b0, b1);
-        (
-            self.combine_f32x8(lo_even, hi_even),
-            self.combine_f32x8(lo_odd, hi_odd),
-        )
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_lt_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
     }
     #[inline(always)]
-    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_le_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
     }
     #[inline(always)]
-    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_ge_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
     }
     #[inline(always)]
-    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.max_precise_f32x8(a0, b0),
-            self.max_precise_f32x8(a1, b1),
-        )
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> { // Delegates to `simd_gt_u8x32` per half.
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) // Two mask8x32 results form the mask8x64 output.
     }
     #[inline(always)]
-    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.min_precise_f32x8(a0, b0),
-            self.min_precise_f32x8(a1, b1),
-        )
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Zips only the first halves of `a` and `b`.
+        let (a0, _) = self.split_u8x64(a); // Second half of `a` is unused.
+        let (b0, _) = self.split_u8x64(b); // Second half of `b` is unused.
+        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) // Low zip then high zip of the same pair.
     }
     #[inline(always)]
-    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_add_f32x8(a0, b0, c0),
-            self.mul_add_f32x8(a1, b1, c1),
-        )
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Zips only the second halves of `a` and `b`.
+        let (_, a1) = self.split_u8x64(a); // First half of `a` is unused.
+        let (_, b1) = self.split_u8x64(b); // First half of `b` is unused.
+        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) // Low zip then high zip of the same pair.
     }
     #[inline(always)]
-    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_sub_f32x8(a0, b0, c0),
-            self.mul_sub_f32x8(a1, b1, c1),
-        )
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
     }
     #[inline(always)]
-    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
     }
     #[inline(always)]
-    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_lo = self.zip_low_u8x32(a0, b0);
+        let lo_hi = self.zip_high_u8x32(a0, b0);
+        let hi_lo = self.zip_low_u8x32(a1, b1);
+        let hi_hi = self.zip_high_u8x32(a1, b1);
+        (
+            self.combine_u8x32(lo_lo, lo_hi),
+            self.combine_u8x32(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.round_ties_even_f32x8(a0),
-            self.round_ties_even_f32x8(a1),
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_even = self.unzip_low_u8x32(a0, a1);
+        let lo_odd = self.unzip_high_u8x32(a0, a1);
+        let hi_even = self.unzip_low_u8x32(b0, b1);
+        let hi_odd = self.unzip_high_u8x32(b0, b1);
+        (
+            self.combine_u8x32(lo_even, hi_even),
+            self.combine_u8x32(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let (c0, c1) = self.split_u8x64(c);
+        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
     }
     #[inline(always)]
-    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
         (
-            f32x8 {
+            u8x32 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            f32x8 {
+            u8x32 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f64x4(
-            self.reinterpret_f64_f32x8(a0),
-            self.reinterpret_f64_f32x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.reinterpret_i32_f32x8(a0),
-            self.reinterpret_i32_f32x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, src: &[f32; 16usize]) -> f32x16<Avx2> {
-                let (chunks, []) = src.as_chunks::<4usize>() else {
+            fn kernel(token: Avx2, src: &[u8; 64usize]) -> u8x64<Avx2> {
+                let (chunks, []) = src.as_chunks::<16usize>() else {
                     unreachable!()
                 };
-                let v0: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
-                let v1: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
-                let v2: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
-                let v3: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
-                let tmp0 = _mm_unpacklo_ps(v0, v1);
-                let tmp1 = _mm_unpackhi_ps(v0, v1);
-                let tmp2 = _mm_unpacklo_ps(v2, v3);
-                let tmp3 = _mm_unpackhi_ps(v2, v3);
-                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                token.combine_f32x8(
-                    token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)),
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let v0 = _mm_shuffle_epi8(v0, mask);
+                let v1 = _mm_shuffle_epi8(v1, mask);
+                let v2 = _mm_shuffle_epi8(v2, mask);
+                let v3 = _mm_shuffle_epi8(v3, mask);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u8x32(
+                    token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)),
                 )
             }
         );
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: f32x16<Avx2>, dest: &mut [f32; 16usize]) -> () {
-                let (v01, v23) = token.split_f32x16(a);
-                let (v0, v1) = token.split_f32x8(v01);
-                let (v2, v3) = token.split_f32x8(v23);
+            fn kernel(token: Avx2, a: u8x64<Avx2>, dest: &mut [u8; 64usize]) -> () {
+                let (v01, v23) = token.split_u8x64(a);
+                let (v0, v1) = token.split_u8x32(v01);
+                let (v2, v3) = token.split_u8x32(v23);
                 let v0 = v0.into();
                 let v1 = v1.into();
                 let v2 = v2.into();
                 let v3 = v3.into();
-                let tmp0 = _mm_unpacklo_ps(v0, v1);
-                let tmp1 = _mm_unpackhi_ps(v0, v1);
-                let tmp2 = _mm_unpacklo_ps(v2, v3);
-                let tmp3 = _mm_unpackhi_ps(v2, v3);
-                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let out0 = _mm_shuffle_epi8(out0, mask);
+                let out1 = _mm_shuffle_epi8(out1, mask);
+                let out2 = _mm_shuffle_epi8(out2, mask);
+                let out3 = _mm_shuffle_epi8(out3, mask);
+                let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
                     unreachable!()
                 };
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out0,
                     &mut chunks[0],
                 );
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out1,
                     &mut chunks[1],
                 );
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out2,
                     &mut chunks[2],
                 );
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out3,
                     &mut chunks[3],
                 );
@@ -8742,585 +11592,721 @@ impl Simd for Avx2 {
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u8x32(a0),
+            self.reinterpret_u32_u8x32(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
+        let half = self.splat_mask8x32(val);
+        self.combine_mask8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
+        mask8x64 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, bits: u64) -> mask8x64<Avx2> {
+                {
+                    let bit_bytes = _mm256_set1_epi64x(bits.cast_signed());
+                    let bit_mask = _mm256_setr_epi8(
+                        1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16,
+                        32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
+                    );
+                    mask8x64 {
+                        val: crate::support::Aligned512([
+                            {
+                                let bit_bytes = _mm256_shuffle_epi8(
+                                    bit_bytes,
+                                    _mm256_setr_epi8(
+                                        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
+                                        2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+                                    ),
+                                );
+                                _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+                            },
+                            {
+                                let bit_bytes = _mm256_shuffle_epi8(
+                                    bit_bytes,
+                                    _mm256_setr_epi8(
+                                        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
+                                        6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+                                    ),
+                                );
+                                _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+                            },
+                        ]),
+                        simd: token,
+                    }
+                }
+            }
+        );
+        kernel(self, bits)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
+    }
+    #[inline(always)]
+    fn select_mask8x64(
+        self,
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        let (c0, c1) = self.split_mask8x64(c);
+        self.combine_mask8x32(
+            self.select_mask8x32(a0, b0, c0),
+            self.select_mask8x32(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_f32x8(a0),
-            self.reinterpret_u32_f32x8(a1),
-        )
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.cvt_u32_precise_f32x8(a0),
-            self.cvt_u32_precise_f32x8(a1),
-        )
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.cvt_i32_precise_f32x8(a0),
-            self.cvt_i32_precise_f32x8(a1),
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+        (
+            mask8x32 {
+                val: crate::support::Aligned256(a.val.0[0]),
+                simd: self,
+            },
+            mask8x32 {
+                val: crate::support::Aligned256(a.val.0[1]),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
-        let half = self.splat_i8x32(val);
-        self.combine_i8x32(half, half)
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
+        let half = self.splat_i16x16(val);
+        self.combine_i16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
-        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
-        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
-        i8x64 {
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_alignr_256x2(
             self,
-            self.cvt_to_bytes_i8x64(b).val.0,
-            self.cvt_to_bytes_i8x64(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_i16x32(b).val.0,
+            self.cvt_to_bytes_i16x32(a).val.0,
+            SHIFT * 2usize,
         );
-        self.cvt_from_bytes_i8x64(u8x64 {
+        self.cvt_from_bytes_i16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+    fn slide_within_blocks_i16x32<const SHIFT: usize>(
         self,
-        a: i8x64<Self>,
-        b: i8x64<Self>,
-    ) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(
-            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
     }
     #[inline(always)]
-    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
     }
     #[inline(always)]
-    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
     }
     #[inline(always)]
-    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, _) = self.split_i8x64(a);
-        let (b0, _) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (_, a1) = self.split_i8x64(a);
-        let (_, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, _) = self.split_i16x32(a);
+        let (b0, _) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
     }
     #[inline(always)]
-    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (_, a1) = self.split_i16x32(a);
+        let (_, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_lo = self.zip_low_i8x32(a0, b0);
-        let lo_hi = self.zip_high_i8x32(a0, b0);
-        let hi_lo = self.zip_low_i8x32(a1, b1);
-        let hi_hi = self.zip_high_i8x32(a1, b1);
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.unzip_high_i16x16(a0, a1),
+            self.unzip_high_i16x16(b0, b1),
+        )
+    }
+    #[inline(always)]
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_lo = self.zip_low_i16x16(a0, b0);
+        let lo_hi = self.zip_high_i16x16(a0, b0);
+        let hi_lo = self.zip_low_i16x16(a1, b1);
+        let hi_hi = self.zip_high_i16x16(a1, b1);
         (
-            self.combine_i8x32(lo_lo, lo_hi),
-            self.combine_i8x32(hi_lo, hi_hi),
+            self.combine_i16x16(lo_lo, lo_hi),
+            self.combine_i16x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_even = self.unzip_low_i8x32(a0, a1);
-        let lo_odd = self.unzip_high_i8x32(a0, a1);
-        let hi_even = self.unzip_low_i8x32(b0, b1);
-        let hi_odd = self.unzip_high_i8x32(b0, b1);
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_even = self.unzip_low_i16x16(a0, a1);
+        let lo_odd = self.unzip_high_i16x16(a0, a1);
+        let hi_even = self.unzip_low_i16x16(b0, b1);
+        let hi_odd = self.unzip_high_i16x16(b0, b1);
         (
-            self.combine_i8x32(lo_even, hi_even),
-            self.combine_i8x32(lo_odd, hi_odd),
+            self.combine_i16x16(lo_even, hi_even),
+            self.combine_i16x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let (c0, c1) = self.split_i8x64(c);
-        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let (c0, c1) = self.split_i16x32(c);
+        self.combine_i16x16(
+            self.select_i16x16(a0, b0, c0),
+            self.select_i16x16(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
         (
-            i8x32 {
+            i16x16 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            i8x32 {
+            i16x16 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_i16x16(a0),
+            self.reinterpret_u8_i16x16(a1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i8x64(a);
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i16x32(a);
         self.combine_u32x8(
-            self.reinterpret_u32_i8x32(a0),
-            self.reinterpret_u32_i8x32(a1),
+            self.reinterpret_u32_i16x16(a0),
+            self.reinterpret_u32_i16x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
-        let half = self.splat_u8x32(val);
-        self.combine_u8x32(half, half)
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
+        let half = self.splat_u16x16(val);
+        self.combine_u16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0)
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
-        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u8; 64usize]>(&a.val.0)
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
-        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_alignr_256x2(
             self,
-            self.cvt_to_bytes_u8x64(b).val.0,
-            self.cvt_to_bytes_u8x64(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_u16x32(b).val.0,
+            self.cvt_to_bytes_u16x32(a).val.0,
+            SHIFT * 2usize,
         );
-        self.cvt_from_bytes_u8x64(u8x64 {
+        self.cvt_from_bytes_u16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+    fn slide_within_blocks_u16x32<const SHIFT: usize>(
         self,
-        a: u8x64<Self>,
-        b: u8x64<Self>,
-    ) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(
-            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
     }
     #[inline(always)]
-    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, _) = self.split_u8x64(a);
-        let (b0, _) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, _) = self.split_u16x32(a);
+        let (b0, _) = self.split_u16x32(b);
+        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (_, a1) = self.split_u8x64(a);
-        let (_, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (_, a1) = self.split_u16x32(a);
+        let (_, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.unzip_high_u16x16(a0, a1),
+            self.unzip_high_u16x16(b0, b1),
+        )
     }
     #[inline(always)]
-    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_lo = self.zip_low_u8x32(a0, b0);
-        let lo_hi = self.zip_high_u8x32(a0, b0);
-        let hi_lo = self.zip_low_u8x32(a1, b1);
-        let hi_hi = self.zip_high_u8x32(a1, b1);
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let lo_lo = self.zip_low_u16x16(a0, b0);
+        let lo_hi = self.zip_high_u16x16(a0, b0);
+        let hi_lo = self.zip_low_u16x16(a1, b1);
+        let hi_hi = self.zip_high_u16x16(a1, b1);
         (
-            self.combine_u8x32(lo_lo, lo_hi),
-            self.combine_u8x32(hi_lo, hi_hi),
+            self.combine_u16x16(lo_lo, lo_hi),
+            self.combine_u16x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_even = self.unzip_low_u8x32(a0, a1);
-        let lo_odd = self.unzip_high_u8x32(a0, a1);
-        let hi_even = self.unzip_low_u8x32(b0, b1);
-        let hi_odd = self.unzip_high_u8x32(b0, b1);
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let lo_even = self.unzip_low_u16x16(a0, a1);
+        let lo_odd = self.unzip_high_u16x16(a0, a1);
+        let hi_even = self.unzip_low_u16x16(b0, b1);
+        let hi_odd = self.unzip_high_u16x16(b0, b1);
         (
-            self.combine_u8x32(lo_even, hi_even),
-            self.combine_u8x32(lo_odd, hi_odd),
+            self.combine_u16x16(lo_even, hi_even),
+            self.combine_u16x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let (c0, c1) = self.split_u8x64(c);
-        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let (c0, c1) = self.split_u16x32(c);
+        self.combine_u16x16(
+            self.select_u16x16(a0, b0, c0),
+            self.select_u16x16(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
         (
-            u8x32 {
+            u16x16 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            u8x32 {
+            u16x16 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, src: &[u8; 64usize]) -> u8x64<Avx2> {
-                let (chunks, []) = src.as_chunks::<16usize>() else {
+            fn kernel(token: Avx2, src: &[u16; 32usize]) -> u16x32<Avx2> {
+                let (chunks, []) = src.as_chunks::<8usize>() else {
                     unreachable!()
                 };
                 let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
                 let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
                 let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
                 let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
-                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
                 let v0 = _mm_shuffle_epi8(v0, mask);
                 let v1 = _mm_shuffle_epi8(v1, mask);
                 let v2 = _mm_shuffle_epi8(v2, mask);
@@ -9333,22 +12319,22 @@ impl Simd for Avx2 {
                 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
                 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
                 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                token.combine_u8x32(
-                    token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)),
+                token.combine_u16x16(
+                    token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)),
                 )
             }
         );
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u8x64<Avx2>, dest: &mut [u8; 64usize]) -> () {
-                let (v01, v23) = token.split_u8x64(a);
-                let (v0, v1) = token.split_u8x32(v01);
-                let (v2, v3) = token.split_u8x32(v23);
+            fn kernel(token: Avx2, a: u16x32<Avx2>, dest: &mut [u16; 32usize]) -> () {
+                let (v01, v23) = token.split_u16x32(a);
+                let (v0, v1) = token.split_u16x16(v01);
+                let (v2, v3) = token.split_u16x16(v23);
                 let v0 = v0.into();
                 let v1 = v1.into();
                 let v2 = v2.into();
@@ -9361,27 +12347,27 @@ impl Simd for Avx2 {
                 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
                 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
                 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
                 let out0 = _mm_shuffle_epi8(out0, mask);
                 let out1 = _mm_shuffle_epi8(out1, mask);
                 let out2 = _mm_shuffle_epi8(out2, mask);
                 let out3 = _mm_shuffle_epi8(out3, mask);
-                let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+                let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
                     unreachable!()
                 };
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
                     out0,
                     &mut chunks[0],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
                     out1,
                     &mut chunks[1],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
                     out2,
                     &mut chunks[2],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
                     out3,
                     &mut chunks[3],
                 );
@@ -9390,725 +12376,712 @@ impl Simd for Avx2 {
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u8x64(a);
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u16x32<Avx2>) -> u8x32<Avx2> {
+                let (a, b) = token.split_u16x32(a);
+                let mask = _mm256_set1_epi16(0xFF);
+                let lo_masked = _mm256_and_si256(a.into(), mask);
+                let hi_masked = _mm256_and_si256(b.into(), mask);
+                let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16(
+                    lo_masked, hi_masked,
+                ));
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_u16x16(a0),
+            self.reinterpret_u8_u16x16(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u16x32(a);
         self.combine_u32x8(
-            self.reinterpret_u32_u8x32(a0),
-            self.reinterpret_u32_u8x32(a1),
+            self.reinterpret_u32_u16x16(a0),
+            self.reinterpret_u32_u16x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
-        let half = self.splat_mask8x32(val);
-        self.combine_mask8x32(half, half)
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        let half = self.splat_mask16x16(val);
+        self.combine_mask16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
-        mask8x64 {
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+        mask16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask8x64<Avx2> {
+            fn kernel(token: Avx2, a: mask16x32<Avx2>) -> u64 {
                 {
-                    let bit_bytes = _mm256_set1_epi64x(bits.cast_signed());
-                    let bit_mask = _mm256_setr_epi8(
-                        1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16,
-                        32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
-                    );
-                    mask8x64 {
-                        val: crate::support::Aligned512([
-                            {
-                                let bit_bytes = _mm256_shuffle_epi8(
-                                    bit_bytes,
-                                    _mm256_setr_epi8(
-                                        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
-                                        2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
-                                    ),
-                                );
-                                _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
-                            },
-                            {
-                                let bit_bytes = _mm256_shuffle_epi8(
-                                    bit_bytes,
-                                    _mm256_setr_epi8(
-                                        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
-                                        6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-                                    ),
-                                );
-                                _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
-                            },
-                        ]),
-                        simd: token,
-                    }
+                    let lo = _mm256_movemask_epi8(a.val.0[0]) as u32;
+                    let hi = _mm256_movemask_epi8(a.val.0[1]) as u32;
+                    let lo = _pext_u32(lo, 0x5555_5555u32) as u64;
+                    let hi = _pext_u32(hi, 0x5555_5555u32) as u64;
+                    lo | (hi << 16usize)
                 }
             }
         );
-        kernel(self, bits)
-    }
-    #[inline(always)]
-    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x64(a);
-        let lo = self.to_bitmask_mask8x32(lo);
-        let hi = self.to_bitmask_mask8x32(hi);
-        lo | (hi << 32usize)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 64usize,
+            index < 32usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            64usize
+            32usize
         );
-        let mut lanes = self.as_array_mask8x64(*a);
+        let mut lanes = self.as_array_mask16x32(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x64(lanes);
+        *a = self.load_array_mask16x32(lanes);
     }
     #[inline(always)]
-    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
     }
     #[inline(always)]
-    fn select_mask8x64(
+    fn select_mask16x32(
         self,
-        a: mask8x64<Self>,
-        b: mask8x64<Self>,
-        c: mask8x64<Self>,
-    ) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        let (c0, c1) = self.split_mask8x64(c);
-        self.combine_mask8x32(
-            self.select_mask8x32(a0, b0, c0),
-            self.select_mask8x32(a1, b1, c1),
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        let (c0, c1) = self.split_mask16x32(c);
+        self.combine_mask16x16(
+            self.select_mask16x16(a0, b0, c0),
+            self.select_mask16x16(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(
+            self.simd_eq_mask16x16(a0, b0),
+            self.simd_eq_mask16x16(a1, b1),
+        )
     }
     #[inline(always)]
-    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
     }
     #[inline(always)]
-    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
     }
     #[inline(always)]
-    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
     }
     #[inline(always)]
-    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
     }
     #[inline(always)]
-    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
         (
-            mask8x32 {
+            mask16x16 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            mask8x32 {
+            mask16x16 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
-        let half = self.splat_i16x16(val);
-        self.combine_i16x16(half, half)
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+        let half = self.splat_i32x8(val);
+        self.combine_i32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0)
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
-        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0)
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
-        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_alignr_256x2(
             self,
-            self.cvt_to_bytes_i16x32(b).val.0,
-            self.cvt_to_bytes_i16x32(a).val.0,
-            SHIFT * 2usize,
+            self.cvt_to_bytes_i32x16(b).val.0,
+            self.cvt_to_bytes_i32x16(a).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_i16x32(u8x64 {
+        self.cvt_from_bytes_i32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
         self,
-        a: i16x32<Self>,
-        b: i16x32<Self>,
-    ) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(
+            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
     }
     #[inline(always)]
-    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, _) = self.split_i16x32(a);
-        let (b0, _) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, _) = self.split_i32x16(a);
+        let (b0, _) = self.split_i32x16(b);
+        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (_, a1) = self.split_i16x32(a);
-        let (_, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (_, a1) = self.split_i32x16(a);
+        let (_, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.unzip_high_i16x16(a0, a1),
-            self.unzip_high_i16x16(b0, b1),
-        )
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_lo = self.zip_low_i16x16(a0, b0);
-        let lo_hi = self.zip_high_i16x16(a0, b0);
-        let hi_lo = self.zip_low_i16x16(a1, b1);
-        let hi_hi = self.zip_high_i16x16(a1, b1);
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let lo_lo = self.zip_low_i32x8(a0, b0);
+        let lo_hi = self.zip_high_i32x8(a0, b0);
+        let hi_lo = self.zip_low_i32x8(a1, b1);
+        let hi_hi = self.zip_high_i32x8(a1, b1);
         (
-            self.combine_i16x16(lo_lo, lo_hi),
-            self.combine_i16x16(hi_lo, hi_hi),
+            self.combine_i32x8(lo_lo, lo_hi),
+            self.combine_i32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_even = self.unzip_low_i16x16(a0, a1);
-        let lo_odd = self.unzip_high_i16x16(a0, a1);
-        let hi_even = self.unzip_low_i16x16(b0, b1);
-        let hi_odd = self.unzip_high_i16x16(b0, b1);
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let lo_even = self.unzip_low_i32x8(a0, a1);
+        let lo_odd = self.unzip_high_i32x8(a0, a1);
+        let hi_even = self.unzip_low_i32x8(b0, b1);
+        let hi_odd = self.unzip_high_i32x8(b0, b1);
         (
-            self.combine_i16x16(lo_even, hi_even),
-            self.combine_i16x16(lo_odd, hi_odd),
+            self.combine_i32x8(lo_even, hi_even),
+            self.combine_i32x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let (c0, c1) = self.split_i16x32(c);
-        self.combine_i16x16(
-            self.select_i16x16(a0, b0, c0),
-            self.select_i16x16(a1, b1, c1),
-        )
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let (c0, c1) = self.split_i32x16(c);
+        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
         (
-            i16x16 {
+            i32x8 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            i16x16 {
+            i32x8 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_i16x16(a0),
-            self.reinterpret_u8_i16x16(a1),
-        )
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i16x32(a);
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
         self.combine_u32x8(
-            self.reinterpret_u32_i16x16(a0),
-            self.reinterpret_u32_i16x16(a1),
+            self.reinterpret_u32_i32x8(a0),
+            self.reinterpret_u32_i32x8(a1),
         )
     }
     #[inline(always)]
-    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
-        let half = self.splat_u16x16(val);
-        self.combine_u16x16(half, half)
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
     }
     #[inline(always)]
-    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        let half = self.splat_u32x8(val);
+        self.combine_u32x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0)
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
-        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u16; 32usize]>(&a.val.0)
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
-        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_alignr_256x2(
             self,
-            self.cvt_to_bytes_u16x32(b).val.0,
-            self.cvt_to_bytes_u16x32(a).val.0,
-            SHIFT * 2usize,
+            self.cvt_to_bytes_u32x16(b).val.0,
+            self.cvt_to_bytes_u32x16(a).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u16x32(u8x64 {
+        self.cvt_from_bytes_u32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
         self,
-        a: u16x32<Self>,
-        b: u16x32<Self>,
-    ) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(
+            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
     }
     #[inline(always)]
-    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, _) = self.split_u16x32(a);
-        let (b0, _) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, _) = self.split_u32x16(a);
+        let (b0, _) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (_, a1) = self.split_u16x32(a);
-        let (_, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (_, a1) = self.split_u32x16(a);
+        let (_, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.unzip_high_u16x16(a0, a1),
-            self.unzip_high_u16x16(b0, b1),
-        )
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_lo = self.zip_low_u16x16(a0, b0);
-        let lo_hi = self.zip_high_u16x16(a0, b0);
-        let hi_lo = self.zip_low_u16x16(a1, b1);
-        let hi_hi = self.zip_high_u16x16(a1, b1);
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_lo = self.zip_low_u32x8(a0, b0);
+        let lo_hi = self.zip_high_u32x8(a0, b0);
+        let hi_lo = self.zip_low_u32x8(a1, b1);
+        let hi_hi = self.zip_high_u32x8(a1, b1);
         (
-            self.combine_u16x16(lo_lo, lo_hi),
-            self.combine_u16x16(hi_lo, hi_hi),
+            self.combine_u32x8(lo_lo, lo_hi),
+            self.combine_u32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_even = self.unzip_low_u16x16(a0, a1);
-        let lo_odd = self.unzip_high_u16x16(a0, a1);
-        let hi_even = self.unzip_low_u16x16(b0, b1);
-        let hi_odd = self.unzip_high_u16x16(b0, b1);
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_even = self.unzip_low_u32x8(a0, a1);
+        let lo_odd = self.unzip_high_u32x8(a0, a1);
+        let hi_even = self.unzip_low_u32x8(b0, b1);
+        let hi_odd = self.unzip_high_u32x8(b0, b1);
         (
-            self.combine_u16x16(lo_even, hi_even),
-            self.combine_u16x16(lo_odd, hi_odd),
+            self.combine_u32x8(lo_even, hi_even),
+            self.combine_u32x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let (c0, c1) = self.split_u16x32(c);
-        self.combine_u16x16(
-            self.select_u16x16(a0, b0, c0),
-            self.select_u16x16(a1, b1, c1),
-        )
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let (c0, c1) = self.split_u32x16(c);
+        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
         (
-            u16x16 {
+            u32x8 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            u16x16 {
+            u32x8 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, src: &[u16; 32usize]) -> u16x32<Avx2> {
-                let (chunks, []) = src.as_chunks::<8usize>() else {
+            fn kernel(token: Avx2, src: &[u32; 16usize]) -> u32x16<Avx2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
                     unreachable!()
                 };
                 let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
                 let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
                 let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
                 let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
-                let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-                let v0 = _mm_shuffle_epi8(v0, mask);
-                let v1 = _mm_shuffle_epi8(v1, mask);
-                let v2 = _mm_shuffle_epi8(v2, mask);
-                let v3 = _mm_shuffle_epi8(v3, mask);
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
                 let tmp0 = _mm_unpacklo_epi32(v0, v1);
                 let tmp1 = _mm_unpackhi_epi32(v0, v1);
                 let tmp2 = _mm_unpacklo_epi32(v2, v3);
@@ -10117,22 +13090,22 @@ impl Simd for Avx2 {
                 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
                 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
                 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                token.combine_u16x16(
-                    token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)),
+                token.combine_u32x8(
+                    token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)),
                 )
             }
         );
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: u16x32<Avx2>, dest: &mut [u16; 32usize]) -> () {
-                let (v01, v23) = token.split_u16x32(a);
-                let (v0, v1) = token.split_u16x16(v01);
-                let (v2, v3) = token.split_u16x16(v23);
+            fn kernel(token: Avx2, a: u32x16<Avx2>, dest: &mut [u32; 16usize]) -> () {
+                let (v01, v23) = token.split_u32x16(a);
+                let (v0, v1) = token.split_u32x8(v01);
+                let (v2, v3) = token.split_u32x8(v23);
                 let v0 = v0.into();
                 let v1 = v1.into();
                 let v2 = v2.into();
@@ -10145,27 +13118,22 @@ impl Simd for Avx2 {
                 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
                 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
                 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-                let out0 = _mm_shuffle_epi8(out0, mask);
-                let out1 = _mm_shuffle_epi8(out1, mask);
-                let out2 = _mm_shuffle_epi8(out2, mask);
-                let out3 = _mm_shuffle_epi8(out3, mask);
-                let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
                     unreachable!()
                 };
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
                     out0,
                     &mut chunks[0],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
                     out1,
                     &mut chunks[1],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
                     out2,
                     &mut chunks[2],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
                     out3,
                     &mut chunks[3],
                 );
@@ -10174,1234 +13142,1072 @@ impl Simd for Avx2 {
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: u16x32<Avx2>) -> u8x32<Avx2> {
-                let (a, b) = token.split_u16x32(a);
-                let mask = _mm256_set1_epi16(0xFF);
-                let lo_masked = _mm256_and_si256(a.into(), mask);
-                let hi_masked = _mm256_and_si256(b.into(), mask);
-                let result = _mm256_permute4x64_epi64::<0b_11_01_10_00>(_mm256_packus_epi16(
-                    lo_masked, hi_masked,
-                ));
-                result.simd_into(token)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_u16x16(a0),
-            self.reinterpret_u8_u16x16(a1),
-        )
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_u16x16(a0),
-            self.reinterpret_u32_u16x16(a1),
-        )
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
     }
     #[inline(always)]
-    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
-        let half = self.splat_mask16x16(val);
-        self.combine_mask16x16(half, half)
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        let half = self.splat_mask32x8(val);
+        self.combine_mask32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
-        mask16x32 {
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
+        mask32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i16; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
-        let lo = self.from_bitmask_mask16x16(bits);
-        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
-        self.combine_mask16x16(lo, hi)
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx2, a: mask16x32<Avx2>) -> u64 {
+            fn kernel(token: Avx2, bits: u64) -> mask32x16<Avx2> {
                 {
-                    let lo = _mm256_movemask_epi8(a.val.0[0]) as u32;
-                    let hi = _mm256_movemask_epi8(a.val.0[1]) as u32;
-                    let lo = _pext_u32(lo, 0x5555_5555u32) as u64;
-                    let hi = _pext_u32(hi, 0x5555_5555u32) as u64;
-                    lo | (hi << 16usize)
+                    let bit_lanes = _mm256_set1_epi32(bits as i32);
+                    mask32x16 {
+                        val: crate::support::Aligned512([
+                            {
+                                let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
+                                _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                            },
+                            {
+                                let bit_mask = _mm256_setr_epi32(
+                                    256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
+                                );
+                                _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                            },
+                        ]),
+                        simd: token,
+                    }
                 }
             }
         );
-        kernel(self, a)
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 32usize,
+            index < 16usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            32usize
+            16usize
         );
-        let mut lanes = self.as_array_mask16x32(*a);
+        let mut lanes = self.as_array_mask32x16(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x32(lanes);
+        *a = self.load_array_mask32x16(lanes);
     }
     #[inline(always)]
-    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
     }
     #[inline(always)]
-    fn select_mask16x32(
+    fn select_mask32x16(
         self,
-        a: mask16x32<Self>,
-        b: mask16x32<Self>,
-        c: mask16x32<Self>,
-    ) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        let (c0, c1) = self.split_mask16x32(c);
-        self.combine_mask16x16(
-            self.select_mask16x16(a0, b0, c0),
-            self.select_mask16x16(a1, b1, c1),
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        let (c0, c1) = self.split_mask32x16(c);
+        self.combine_mask32x8(
+            self.select_mask32x8(a0, b0, c0),
+            self.select_mask32x8(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(
-            self.simd_eq_mask16x16(a0, b0),
-            self.simd_eq_mask16x16(a1, b1),
-        )
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
         (
-            mask16x16 {
+            mask32x8 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            mask16x16 {
+            mask32x8 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
-        let half = self.splat_i32x8(val);
-        self.combine_i32x8(half, half)
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+        let half = self.splat_f64x4(val);
+        self.combine_f64x4(half, half)
     }
     #[inline(always)]
-    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0)
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0)
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m256d; 2usize], [f64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
-        i32x16 {
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_256x2(
             self,
-            self.cvt_to_bytes_i32x16(b).val.0,
-            self.cvt_to_bytes_i32x16(a).val.0,
-            SHIFT * 4usize,
+            self.cvt_to_bytes_f64x8(b).val.0,
+            self.cvt_to_bytes_f64x8(a).val.0,
+            SHIFT * 8usize,
         );
-        self.cvt_from_bytes_i32x16(u8x64 {
+        self.cvt_from_bytes_f64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
         self,
-        a: i32x16<Self>,
-        b: i32x16<Self>,
-    ) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(
-            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
     }
     #[inline(always)]
-    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
     }
     #[inline(always)]
-    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
-    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.approximate_recip_f64x4(a0),
+            self.approximate_recip_f64x4(a1),
+        )
     }
     #[inline(always)]
-    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, _) = self.split_f64x8(a);
+        let (b0, _) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
     }
     #[inline(always)]
-    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (_, a1) = self.split_f64x8(a);
+        let (_, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, _) = self.split_i32x16(a);
-        let (b0, _) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
     }
     #[inline(always)]
-    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (_, a1) = self.split_i32x16(a);
-        let (_, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
     }
     #[inline(always)]
-    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_lo = self.zip_low_f64x4(a0, b0);
+        let lo_hi = self.zip_high_f64x4(a0, b0);
+        let hi_lo = self.zip_low_f64x4(a1, b1);
+        let hi_hi = self.zip_high_f64x4(a1, b1);
+        (
+            self.combine_f64x4(lo_lo, lo_hi),
+            self.combine_f64x4(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_even = self.unzip_low_f64x4(a0, a1);
+        let lo_odd = self.unzip_high_f64x4(a0, a1);
+        let hi_even = self.unzip_low_f64x4(b0, b1);
+        let hi_odd = self.unzip_high_f64x4(b0, b1);
+        (
+            self.combine_f64x4(lo_even, hi_even),
+            self.combine_f64x4(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.max_precise_f64x4(a0, b0),
+            self.max_precise_f64x4(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.min_precise_f64x4(a0, b0),
+            self.min_precise_f64x4(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_add_f64x4(a0, b0, c0),
+            self.mul_add_f64x4(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_sub_f64x4(a0, b0, c0),
+            self.mul_sub_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
     }
     #[inline(always)]
-    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_lo = self.zip_low_i32x8(a0, b0);
-        let lo_hi = self.zip_high_i32x8(a0, b0);
-        let hi_lo = self.zip_low_i32x8(a1, b1);
-        let hi_hi = self.zip_high_i32x8(a1, b1);
-        (
-            self.combine_i32x8(lo_lo, lo_hi),
-            self.combine_i32x8(hi_lo, hi_hi),
-        )
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
     }
     #[inline(always)]
-    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_even = self.unzip_low_i32x8(a0, a1);
-        let lo_odd = self.unzip_high_i32x8(a0, a1);
-        let hi_even = self.unzip_low_i32x8(b0, b1);
-        let hi_odd = self.unzip_high_i32x8(b0, b1);
-        (
-            self.combine_i32x8(lo_even, hi_even),
-            self.combine_i32x8(lo_odd, hi_odd),
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.round_ties_even_f64x4(a0),
+            self.round_ties_even_f64x4(a1),
         )
     }
     #[inline(always)]
-    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let (c0, c1) = self.split_i32x16(c);
-        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
     }
     #[inline(always)]
-    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
     }
     #[inline(always)]
-    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
         (
-            i32x8 {
+            f64x4 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            i32x8 {
+            f64x4 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_i32x8(a0),
-            self.reinterpret_u32_i32x8(a1),
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f32x8(
+            self.reinterpret_f32_f64x4(a0),
+            self.reinterpret_f32_f64x4(a1),
         )
     }
     #[inline(always)]
-    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
-    }
-    #[inline(always)]
-    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
-        let half = self.splat_u32x8(val);
-        self.combine_u32x8(half, half)
+    fn splat_i64x8(self, val: i64) -> i64x8<Self> {
+        let half = self.splat_i64x4(val);
+        self.combine_i64x4(half, half)
     }
     #[inline(always)]
-    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0)
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u32; 16usize]>(&a.val.0)
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [i64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
-        u32x16 {
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
-        let result = cross_block_alignr_256x2(
-            self,
-            self.cvt_to_bytes_u32x16(b).val.0,
-            self.cvt_to_bytes_u32x16(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_u32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_u32x16<const SHIFT: usize>(
-        self,
-        a: u32x16<Self>,
-        b: u32x16<Self>,
-    ) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(
-            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
-    }
-    #[inline(always)]
-    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, _) = self.split_u32x16(a);
-        let (b0, _) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
-    }
-    #[inline(always)]
-    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (_, a1) = self.split_u32x16(a);
-        let (_, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
-    }
-    #[inline(always)]
-    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_i64x8(b).val.0,
+            self.cvt_to_bytes_i64x8(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_lo = self.zip_low_u32x8(a0, b0);
-        let lo_hi = self.zip_high_u32x8(a0, b0);
-        let hi_lo = self.zip_low_u32x8(a1, b1);
-        let hi_hi = self.zip_high_u32x8(a1, b1);
-        (
-            self.combine_u32x8(lo_lo, lo_hi),
-            self.combine_u32x8(hi_lo, hi_hi),
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
+        self,
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(
+            self.slide_within_blocks_i64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_even = self.unzip_low_u32x8(a0, a1);
-        let lo_odd = self.unzip_high_u32x8(a0, a1);
-        let hi_even = self.unzip_low_u32x8(b0, b1);
-        let hi_odd = self.unzip_high_u32x8(b0, b1);
-        (
-            self.combine_u32x8(lo_even, hi_even),
-            self.combine_u32x8(lo_odd, hi_odd),
-        )
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let (c0, c1) = self.split_u32x16(c);
-        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        (
-            u32x8 {
-                val: crate::support::Aligned256(a.val.0[0]),
-                simd: self,
-            },
-            u32x8 {
-                val: crate::support::Aligned256(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, src: &[u32; 16usize]) -> u32x16<Avx2> {
-                let (chunks, []) = src.as_chunks::<4usize>() else {
-                    unreachable!()
-                };
-                let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
-                let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
-                let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
-                let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
-                let tmp0 = _mm_unpacklo_epi32(v0, v1);
-                let tmp1 = _mm_unpackhi_epi32(v0, v1);
-                let tmp2 = _mm_unpacklo_epi32(v2, v3);
-                let tmp3 = _mm_unpackhi_epi32(v2, v3);
-                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                token.combine_u32x8(
-                    token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)),
-                )
-            }
-        );
-        kernel(self, src)
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, a: u32x16<Avx2>, dest: &mut [u32; 16usize]) -> () {
-                let (v01, v23) = token.split_u32x16(a);
-                let (v0, v1) = token.split_u32x8(v01);
-                let (v2, v3) = token.split_u32x8(v23);
-                let v0 = v0.into();
-                let v1 = v1.into();
-                let v2 = v2.into();
-                let v3 = v3.into();
-                let tmp0 = _mm_unpacklo_epi32(v0, v1);
-                let tmp1 = _mm_unpackhi_epi32(v0, v1);
-                let tmp2 = _mm_unpacklo_epi32(v2, v3);
-                let tmp3 = _mm_unpackhi_epi32(v2, v3);
-                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
-                    unreachable!()
-                };
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out0,
-                    &mut chunks[0],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out1,
-                    &mut chunks[1],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out2,
-                    &mut chunks[2],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out3,
-                    &mut chunks[3],
-                );
-            }
-        );
-        kernel(self, a, dest);
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
-        let half = self.splat_mask32x8(val);
-        self.combine_mask32x8(half, half)
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [i32; 16usize]>(&a.val.0)
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx2, bits: u64) -> mask32x16<Avx2> {
-                {
-                    let bit_lanes = _mm256_set1_epi32(bits as i32);
-                    mask32x16 {
-                        val: crate::support::Aligned512([
-                            {
-                                let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
-                                _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
-                            },
-                            {
-                                let bit_mask = _mm256_setr_epi32(
-                                    256, 512, 1024, 2048, 4096, 8192, 16384, 32768,
-                                );
-                                _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
-                            },
-                        ]),
-                        simd: token,
-                    }
-                }
-            }
-        );
-        kernel(self, bits)
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x16(a);
-        let lo = self.to_bitmask_mask32x8(lo);
-        let hi = self.to_bitmask_mask32x8(hi);
-        lo | (hi << 8usize)
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask32x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x16(lanes);
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, _) = self.split_i64x8(a);
+        let (b0, _) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0))
     }
     #[inline(always)]
-    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (_, a1) = self.split_i64x8(a);
+        let (_, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn select_mask32x16(
-        self,
-        a: mask32x16<Self>,
-        b: mask32x16<Self>,
-        c: mask32x16<Self>,
-    ) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        let (c0, c1) = self.split_mask32x16(c);
-        self.combine_mask32x8(
-            self.select_mask32x8(a0, b0, c0),
-            self.select_mask32x8(a1, b1, c1),
-        )
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_lo = self.zip_low_i64x4(a0, b0);
+        let lo_hi = self.zip_high_i64x4(a0, b0);
+        let hi_lo = self.zip_low_i64x4(a1, b1);
+        let hi_hi = self.zip_high_i64x4(a1, b1);
+        (
+            self.combine_i64x4(lo_lo, lo_hi),
+            self.combine_i64x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_even = self.unzip_low_i64x4(a0, a1);
+        let lo_odd = self.unzip_high_i64x4(a0, a1);
+        let hi_even = self.unzip_low_i64x4(b0, b1);
+        let hi_odd = self.unzip_high_i64x4(b0, b1);
+        (
+            self.combine_i64x4(lo_even, hi_even),
+            self.combine_i64x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let (c0, c1) = self.split_i64x8(c);
+        self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>) {
         (
-            mask32x8 {
+            i64x4 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            mask32x8 {
+            i64x4 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
-        let half = self.splat_f64x4(val);
-        self.combine_f64x4(half, half)
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1))
     }
     #[inline(always)]
-    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i64x4(a0),
+            self.reinterpret_u32_i64x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u64x8(self, val: u64) -> u64x8<Self> {
+        let half = self.splat_u64x4(val);
+        self.combine_u64x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
-        crate::transmute::checked_transmute_copy::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0)
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m256i; 2usize], [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
-        crate::transmute::checked_cast_ref::<[__m256d; 2usize], [f64; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m256i; 2usize], [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
-        crate::transmute::checked_cast_mut::<[__m256d; 2usize], [f64; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m256i; 2usize], [u64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_256x2(
             self,
-            self.cvt_to_bytes_f64x8(b).val.0,
-            self.cvt_to_bytes_f64x8(a).val.0,
+            self.cvt_to_bytes_u64x8(b).val.0,
+            self.cvt_to_bytes_u64x8(a).val.0,
             SHIFT * 8usize,
         );
-        self.cvt_from_bytes_f64x8(u8x64 {
+        self.cvt_from_bytes_u64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
         self,
-        a: f64x8<Self>,
-        b: f64x8<Self>,
-    ) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
-    }
-    #[inline(always)]
-    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
-    }
-    #[inline(always)]
-    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
-    }
-    #[inline(always)]
-    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.approximate_recip_f64x4(a0),
-            self.approximate_recip_f64x4(a1),
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(
+            self.slide_within_blocks_u64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1))
     }
     #[inline(always)]
-    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, _) = self.split_f64x8(a);
-        let (b0, _) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (_, a1) = self.split_f64x8(a);
-        let (_, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_lo = self.zip_low_f64x4(a0, b0);
-        let lo_hi = self.zip_high_f64x4(a0, b0);
-        let hi_lo = self.zip_low_f64x4(a1, b1);
-        let hi_hi = self.zip_high_f64x4(a1, b1);
-        (
-            self.combine_f64x4(lo_lo, lo_hi),
-            self.combine_f64x4(hi_lo, hi_hi),
-        )
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_even = self.unzip_low_f64x4(a0, a1);
-        let lo_odd = self.unzip_high_f64x4(a0, a1);
-        let hi_even = self.unzip_low_f64x4(b0, b1);
-        let hi_odd = self.unzip_high_f64x4(b0, b1);
-        (
-            self.combine_f64x4(lo_even, hi_even),
-            self.combine_f64x4(lo_odd, hi_odd),
-        )
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, _) = self.split_u64x8(a);
+        let (b0, _) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0))
     }
     #[inline(always)]
-    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.max_precise_f64x4(a0, b0),
-            self.max_precise_f64x4(a1, b1),
-        )
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (_, a1) = self.split_u64x8(a);
+        let (_, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.min_precise_f64x4(a0, b0),
-            self.min_precise_f64x4(a1, b1),
-        )
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_add_f64x4(a0, b0, c0),
-            self.mul_add_f64x4(a1, b1, c1),
-        )
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_sub_f64x4(a0, b0, c0),
-            self.mul_sub_f64x4(a1, b1, c1),
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_lo = self.zip_low_u64x4(a0, b0);
+        let lo_hi = self.zip_high_u64x4(a0, b0);
+        let hi_lo = self.zip_low_u64x4(a1, b1);
+        let hi_hi = self.zip_high_u64x4(a1, b1);
+        (
+            self.combine_u64x4(lo_lo, lo_hi),
+            self.combine_u64x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
-    }
-    #[inline(always)]
-    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
-    }
-    #[inline(always)]
-    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.round_ties_even_f64x4(a0),
-            self.round_ties_even_f64x4(a1),
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_even = self.unzip_low_u64x4(a0, a1);
+        let lo_odd = self.unzip_high_u64x4(a0, a1);
+        let hi_even = self.unzip_low_u64x4(b0, b1);
+        let hi_odd = self.unzip_high_u64x4(b0, b1);
+        (
+            self.combine_u64x4(lo_even, hi_even),
+            self.combine_u64x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let (c0, c1) = self.split_u64x8(c);
+        self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_mask64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>) {
         (
-            f64x4 {
+            u64x4 {
                 val: crate::support::Aligned256(a.val.0[0]),
                 simd: self,
             },
-            f64x4 {
+            u64x4 {
                 val: crate::support::Aligned256(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f32x8(
-            self.reinterpret_f32_f64x4(a0),
-            self.reinterpret_f32_f64x4(a1),
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, src: &[u64; 8usize]) -> u64x8<Avx2> {
+                let (chunks, []) = src.as_chunks::<2usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]);
+                let out0 = _mm_unpacklo_epi64(v0, v1);
+                let out1 = _mm_unpacklo_epi64(v2, v3);
+                let out2 = _mm_unpackhi_epi64(v0, v1);
+                let out3 = _mm_unpackhi_epi64(v2, v3);
+                token.combine_u64x4(
+                    token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u64x8<Avx2>, dest: &mut [u64; 8usize]) -> () {
+                let (v01, v23) = token.split_u64x8(a);
+                let (v0, v1) = token.split_u64x4(v01);
+                let (v2, v3) = token.split_u64x4(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let out0 = _mm_unpacklo_epi64(v0, v2);
+                let out1 = _mm_unpackhi_epi64(v0, v2);
+                let out2 = _mm_unpacklo_epi64(v1, v3);
+                let out3 = _mm_unpackhi_epi64(v1, v3);
+                let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest);
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u64x4(a0),
+            self.reinterpret_u32_u64x4(a1),
         )
     }
     #[inline(always)]
@@ -11703,6 +14509,36 @@ impl<S: Simd> From<f64x4<S>> for __m256d {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<__m256i, S> for i64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m256i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x4<S>> for __m256i {
+    #[inline(always)]
+    fn from(value: i64x4<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__m256i, S> for u64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m256i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x4<S>> for __m256i {
+    #[inline(always)]
+    fn from(value: u64x4<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
 impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m256i) -> Self {
diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 976ebd2ad..1a6ff0288 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -14,9 +14,9 @@
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
@@ -51,6 +51,8 @@ impl ArchTypes for Avx512 {
     type u32x4 = crate::support::Aligned128<__m128i>;
     type mask32x4 = __mmask8;
     type f64x2 = crate::support::Aligned128<__m128d>;
+    type i64x2 = crate::support::Aligned128<__m128i>;
+    type u64x2 = crate::support::Aligned128<__m128i>;
     type mask64x2 = __mmask8;
     type f32x8 = crate::support::Aligned256<__m256>;
     type i8x32 = crate::support::Aligned256<__m256i>;
@@ -63,6 +65,8 @@ impl ArchTypes for Avx512 {
     type u32x8 = crate::support::Aligned256<__m256i>;
     type mask32x8 = __mmask8;
     type f64x4 = crate::support::Aligned256<__m256d>;
+    type i64x4 = crate::support::Aligned256<__m256i>;
+    type u64x4 = crate::support::Aligned256<__m256i>;
     type mask64x4 = __mmask8;
     type f32x16 = crate::support::Aligned512<__m512>;
     type i8x64 = crate::support::Aligned512<__m512i>;
@@ -75,6 +79,8 @@ impl ArchTypes for Avx512 {
     type u32x16 = crate::support::Aligned512<__m512i>;
     type mask32x16 = __mmask16;
     type f64x8 = crate::support::Aligned512<__m512d>;
+    type i64x8 = crate::support::Aligned512<__m512i>;
+    type u64x8 = crate::support::Aligned512<__m512i>;
     type mask64x8 = __mmask8;
 }
 impl Simd for Avx512 {
@@ -86,6 +92,8 @@ impl Simd for Avx512 {
     type i16s = i16x32<Self>;
     type u32s = u32x16<Self>;
     type i32s = i32x16<Self>;
+    type u64s = u64x8<Self>;
+    type i64s = i64x8<Self>;
     type mask8s = mask8x64<Self>;
     type mask16s = mask16x32<Self>;
     type mask32s = mask32x16<Self>;
@@ -3906,349 +3914,194 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
-        mask64x2 {
-            val: (if val { 3u64 } else { 0 }) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+    fn splat_i64x2(self, val: i64) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: [i64; 2usize]) -> mask64x2<Avx512> {
-                let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask64x2 {
-                    val: _mm_movepi64_mask(lanes),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, val: i64) -> i64x2<Avx512> {
+                _mm_set1_epi64x(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: mask64x2<Avx512>) -> [i64; 2usize] {
-                let lanes = _mm_movm_epi64(a.val);
-                crate::transmute::checked_transmute_copy(&lanes)
-            }
-        );
-        kernel(self, a)
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
-        mask64x2 {
-            val: (bits & 3u64) as _,
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
-        u64::from((a).val) & 3u64
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 2usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2usize
-        );
-        let bit = 1u64 << index;
-        let bits = u64::from((a).val);
-        let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask64x2 {
-            val: (bits) as _,
-            simd: self,
-        };
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128i, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
-        mask64x2 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _,
-            simd: self,
-        }
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128i, [i64; 2usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
-        mask64x2 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 3u64) as _,
-            simd: self,
-        }
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
-        mask64x2 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 3u64) as _,
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
-        mask64x2 {
-            val: ((!u64::from((a).val)) & 3u64) as _,
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn select_mask64x2(
-        self,
-        a: mask64x2<Self>,
-        b: mask64x2<Self>,
-        c: mask64x2<Self>,
-    ) -> mask64x2<Self> {
-        mask64x2 {
-            val: (((u64::from((a).val) & u64::from((b).val))
-                | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 3u64) as _,
-            simd: self,
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        if SHIFT >= 2usize {
+            return b;
         }
-    }
-    #[inline(always)]
-    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
-        mask64x2 {
-            val: (!u64::from(a.val ^ b.val) & 3u64) as _,
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i64x2(b).val.0,
+            self.cvt_to_bytes_i64x2(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x2(u8x16 {
+            val: crate::support::Aligned128(result),
             simd: self,
-        }
-    }
-    #[inline(always)]
-    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
-        let bits = u64::from((a).val) & 3u64;
-        bits != 0
-    }
-    #[inline(always)]
-    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
-        let bits = u64::from((a).val) & 3u64;
-        bits == 3u64
-    }
-    #[inline(always)]
-    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
-        let bits = u64::from((a).val) & 3u64;
-        bits != 3u64
-    }
-    #[inline(always)]
-    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
-        let bits = u64::from((a).val) & 3u64;
-        bits == 0
+        })
     }
     #[inline(always)]
-    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
-        let bits = (u64::from(a.val) | (u64::from(b.val) << 2usize)) & 15u64;
-        mask64x4 {
-            val: bits as _,
-            simd: self,
-        }
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self> {
+        self.slide_i64x2::<SHIFT>(a, b)
     }
     #[inline(always)]
-    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: f32) -> f32x8<Avx512> {
-                _mm256_set1_ps(val).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
-        crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
-        crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(&mut a.val.0)
-    }
-    #[inline(always)]
-    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
-    }
-    #[inline(always)]
-    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
-        f32x8 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f32x8<Avx512>,
-                b: f32x8<Avx512>,
-                shift: usize,
-            ) -> f32x8<Avx512> {
-                if shift >= 8usize {
-                    return b;
-                }
-                let idx = _mm256_add_epi8(
-                    _mm256_setr_epi8(
-                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                    ),
-                    _mm256_set1_epi8((shift * 4usize) as i8),
-                );
-                let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_f32x8(a).val.0,
-                    idx,
-                    token.cvt_to_bytes_f32x8(b).val.0,
-                );
-                token.cvt_from_bytes_f32x8(u8x32 {
-                    val: crate::support::Aligned256(result),
-                    simd: token,
-                })
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, SHIFT)
-    }
-    #[inline(always)]
-    fn slide_within_blocks_f32x8<const SHIFT: usize>(
-        self,
-        a: f32x8<Self>,
-        b: f32x8<Self>,
-    ) -> f32x8<Self> {
-        if SHIFT == 0 {
-            return a;
-        }
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let a = self.cvt_to_bytes_f32x8(a).val.0;
-        let b = self.cvt_to_bytes_f32x8(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT * 4usize);
-        self.cvt_from_bytes_f32x8(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_mullo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_and_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_sqrt_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_or_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_rcp14_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_xor_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_add_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, shift: u32) -> i64x2<Avx512> {
+                _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_sub_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_mul_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, shift: u32) -> i64x2<Avx512> {
+                _mm_sra_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_div_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_srav_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                let mask = _mm256_set1_ps(-0.0);
-                _mm256_or_ps(
-                    _mm256_and_ps(mask, b.into()),
-                    _mm256_andnot_ps(mask, a.into()),
-                )
-                .simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmpeq_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4256,12 +4109,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmplt_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4269,12 +4122,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmple_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4282,12 +4135,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmpge_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4295,12 +4148,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmpgt_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4308,653 +4161,3865 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_permutex2var_ps(
-                    a.into(),
-                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_permutex2var_ps(
-                    a.into(),
-                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_permutex2var_ps(
-                    a.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(2, 0), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_permutex2var_ps(
-                    a.into(),
-                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(3, 1), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f32x8<Avx512>,
-                b: f32x8<Avx512>,
-            ) -> (f32x8<Avx512>, f32x8<Avx512>) {
+                a: i64x2<Avx512>,
+                b: i64x2<Avx512>,
+            ) -> (i64x2<Avx512>, i64x2<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
-                        .simd_into(token),
-                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
-                        .simd_into(token),
+                    _mm_permutex2var_epi64(a, _mm_set_epi64x(2, 0), b).simd_into(token),
+                    _mm_permutex2var_epi64(a, _mm_set_epi64x(3, 1), b).simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f32x8<Avx512>,
-                b: f32x8<Avx512>,
-            ) -> (f32x8<Avx512>, f32x8<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
-                        .simd_into(token),
-                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
-                        .simd_into(token),
-                )
+                a: mask64x2<Avx512>,
+                b: i64x2<Avx512>,
+                c: i64x2<Avx512>,
+            ) -> i64x2<Avx512> {
+                _mm_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_max_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_min_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_min_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_max_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>, b: i64x2<Avx512>) -> i64x4<Avx512> {
+                _mm256_setr_m128i(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>) -> i64x2<Avx512> {
+                _mm_sub_epi64(_mm_setzero_si128(), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f32x8<Avx512>,
-                b: f32x8<Avx512>,
-                c: f32x8<Avx512>,
-            ) -> f32x8<Avx512> {
-                _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>) -> u8x16<Avx512> {
+                __m128i::from(a).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f32x8<Avx512>,
-                b: f32x8<Avx512>,
-                c: f32x8<Avx512>,
-            ) -> f32x8<Avx512> {
-                _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x2<Avx512>) -> u32x4<Avx512> {
+                __m128i::from(a).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn splat_u64x2(self, val: u64) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, val: u64) -> u64x2<Avx512> {
+                _mm_set1_epi64x(val.cast_signed()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, val)
     }
     #[inline(always)]
-    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
-            }
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128i, [u64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128i, [u64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128i, [u64; 2usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u64x2(b).val.0,
+            self.cvt_to_bytes_u64x2(a).val.0,
+            SHIFT * 8usize,
         );
-        kernel(self, a)
+        self.cvt_from_bytes_u64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        a - self.trunc_f32x8(a)
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
+        self,
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self> {
+        self.slide_u64x2::<SHIFT>(a, b)
     }
     #[inline(always)]
-    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask32x8<Avx512>,
-                b: f32x8<Avx512>,
-                c: f32x8<Avx512>,
-            ) -> f32x8<Avx512> {
-                _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x16<Avx512> {
-                _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_mullo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> (f32x4<Avx512>, f32x4<Avx512>) {
-                (
-                    _mm256_extractf128_ps::<0>(a.into()).simd_into(token),
-                    _mm256_extractf128_ps::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_and_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f64x4<Avx512> {
-                _mm256_castps_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_or_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_castps_si256(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_xor_si128(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u8x32<Avx512> {
-                _mm256_castps_si256(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, shift: u32) -> u64x2<Avx512> {
+                _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_castps_si256(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_cvttps_epu32(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, shift: u32) -> u64x2<Avx512> {
+                _mm_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
-                let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
-                let mut converted = _mm256_cvttps_epu32(a);
-                let exceeds_unsigned_range =
-                    _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a);
-                converted = _mm256_mask_blend_epi32(
-                    exceeds_unsigned_range,
-                    converted,
-                    _mm256_set1_epi32(u32::MAX.cast_signed()),
-                );
-                converted.simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_srlv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_cvttps_epi32(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmpeq_epu64_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
-                let a = a.into();
-                let in_range = _mm256_cmp_ps_mask::<17i32>(a, _mm256_set1_ps(2147483648.0));
-                let mut converted =
-                    _mm256_mask_cvttps_epi32(_mm256_set1_epi32(i32::MAX), in_range, a);
-                let is_not_nan = _mm256_cmp_ps_mask::<7i32>(a, a);
-                converted = _mm256_mask_blend_epi32(is_not_nan, _mm256_setzero_si256(), converted);
-                converted.simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmplt_epu64_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: i8) -> i8x32<Avx512> {
-                _mm256_set1_epi8(val).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmple_epu64_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(&mut a.val.0)
-    }
-    #[inline(always)]
-    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
-    }
-    #[inline(always)]
-    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
-        i8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: i8x32<Avx512>,
-                b: i8x32<Avx512>,
-                shift: usize,
-            ) -> i8x32<Avx512> {
-                if shift >= 32usize {
-                    return b;
-                }
-                let idx = _mm256_add_epi8(
-                    _mm256_setr_epi8(
-                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-                    ),
-                    _mm256_set1_epi8((shift) as i8),
-                );
-                let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_i8x32(a).val.0,
-                    idx,
-                    token.cvt_to_bytes_i8x32(b).val.0,
-                );
-                token.cvt_from_bytes_i8x32(u8x32 {
-                    val: crate::support::Aligned256(result),
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmpge_epu64_mask(a.into(), b.into()),
                     simd: token,
-                })
+                }
             }
         );
-        kernel(self, a, b, SHIFT)
-    }
-    #[inline(always)]
-    fn slide_within_blocks_i8x32<const SHIFT: usize>(
-        self,
-        a: i8x32<Self>,
-        b: i8x32<Self>,
-    ) -> i8x32<Self> {
-        if SHIFT == 0 {
-            return a;
-        }
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let a = self.cvt_to_bytes_i8x32(a).val.0;
-        let b = self.cvt_to_bytes_i8x32(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT);
-        self.cvt_from_bytes_i8x32(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_add_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> mask64x2<Avx512> {
+                mask64x2 {
+                    val: _mm_cmpgt_epu64_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_sub_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                let dst_even = _mm256_mullo_epi16(a.into(), b.into());
-                let dst_odd = _mm256_mullo_epi16(
-                    _mm256_srli_epi16::<8>(a.into()),
-                    _mm256_srli_epi16::<8>(b.into()),
-                );
-                _mm256_or_si256(
-                    _mm256_slli_epi16(dst_odd, 8),
-                    _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(2, 0), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_permutex2var_epi64(a.into(), _mm_set_epi64x(3, 1), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(
+                token: Avx512,
+                a: u64x2<Avx512>,
+                b: u64x2<Avx512>,
+            ) -> (u64x2<Avx512>, u64x2<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm_permutex2var_epi64(a, _mm_set_epi64x(2, 0), b).simd_into(token),
+                    _mm_permutex2var_epi64(a, _mm_set_epi64x(3, 1), b).simd_into(token),
+                )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        a ^ !0
-    }
-    #[inline(always)]
-    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, shift: u32) -> i8x32<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 =
-                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let hi_16 =
-                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
-                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(
+                token: Avx512,
+                a: mask64x2<Avx512>,
+                b: u64x2<Avx512>,
+                c: u64x2<Avx512>,
+            ) -> u64x2<Avx512> {
+                _mm_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm256_setzero_si256();
-                let value_extend = zero;
-                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm256_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_min_epu64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, shift: u32) -> i8x32<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 =
-                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let hi_16 =
-                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
-                let lo_shifted = _mm256_sra_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_sra_epi16(hi_16, shift_count);
-                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x2<Avx512> {
+                _mm_max_epu64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm256_setzero_si256();
-                let value_extend = _mm256_cmpgt_epi8(zero, val);
-                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm256_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u64x2<Avx512>, b: u64x2<Avx512>) -> u64x4<Avx512> {
+                _mm256_setr_m128i(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmpeq_epi8_mask(a.into(), b.into()),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: u64x2<Avx512>) -> u8x16<Avx512> {
+                __m128i::from(a).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmplt_epi8_mask(a.into(), b.into()),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: u64x2<Avx512>) -> u32x4<Avx512> {
+                __m128i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+        mask64x2 {
+            val: (if val { 3u64 } else { 0 }) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i64; 2usize]) -> mask64x2<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask64x2 {
+                    val: _mm_movepi64_mask(lanes),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask64x2<Avx512>) -> [i64; 2usize] {
+                let lanes = _mm_movm_epi64(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        mask64x2 {
+            val: (bits & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        u64::from((a).val) & 3u64
+    }
+    #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let bit = 1u64 << index;
+        let bits = u64::from((a).val);
+        let bits = if value { bits | bit } else { bits & !bit };
+        *a = mask64x2 {
+            val: (bits) as _,
+            simd: self,
+        };
+    }
+    #[inline(always)]
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: ((!u64::from((a).val)) & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
+        mask64x2 {
+            val: (((u64::from((a).val) & u64::from((b).val))
+                | ((!u64::from((a).val)) & u64::from((c).val)))
+                & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        mask64x2 {
+            val: (!u64::from(a.val ^ b.val) & 3u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        let bits = u64::from((a).val) & 3u64;
+        bits != 0
+    }
+    #[inline(always)]
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        let bits = u64::from((a).val) & 3u64;
+        bits == 3u64
+    }
+    #[inline(always)]
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        let bits = u64::from((a).val) & 3u64;
+        bits != 3u64
+    }
+    #[inline(always)]
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        let bits = u64::from((a).val) & 3u64;
+        bits == 0
+    }
+    #[inline(always)]
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+        let bits = (u64::from(a.val) | (u64::from(b.val) << 2usize)) & 15u64;
+        mask64x4 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: f32) -> f32x8<Avx512> {
+                _mm256_set1_ps(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256, [f32; 8usize]>(&a.val.0)
+    }
+    /// Views the register stored in `a` as a shared reference to `[f32; 8]`.
+    #[inline(always)]
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        let raw: &__m256 = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256, [f32; 8usize]>(raw)
+    }
+    /// Views the register stored in `a` as a mutable reference to `[f32; 8]`.
+    #[inline(always)]
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        let raw: &mut __m256 = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256, [f32; 8usize]>(raw)
+    }
+    /// Writes the eight lanes of `a` into `dest` via `checked_transmute_store`.
+    #[inline(always)]
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+        let raw = a.val.0;
+        crate::transmute::checked_transmute_store(raw, dest);
+    }
+    /// Reinterprets the storage of a `u8x32` as an `f32x8` by copying it through
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&a.val);
+        f32x8 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Reinterprets the storage of an `f32x8` as a `u8x32` by copying it through
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let bytes = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: bytes,
+            simd: self,
+        }
+    }
+    /// Slides a window of eight lanes over the concatenation of `a` and `b`,
+    /// starting `SHIFT` lanes into `a`.
+    ///
+    /// `SHIFT == 0` yields `a`; `SHIFT >= 8` yields `b`. The const parameter is
+    /// forwarded to the kernel as a runtime `shift` argument.
+    #[inline(always)]
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+                shift: usize,
+            ) -> f32x8<Avx512> {
+                if shift >= 8usize {
+                    return b;
+                }
+                // Byte indices 0..32 offset by the shift in bytes (4 bytes per lane).
+                // Indices that reach 32 or more select bytes from the second source.
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 4usize) as i8),
+                );
+                // Two-source byte permute over the byte views of `a` and `b`.
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_f32x8(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_f32x8(b).val.0,
+                );
+                token.cvt_from_bytes_f32x8(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
+    }
+    /// Block-local variant of the slide operation for `f32x8`.
+    ///
+    /// Returns `a` unchanged when `SHIFT == 0` and `b` when `SHIFT >= 4`.
+    /// Otherwise the byte views of both inputs are handed to `dyn_alignr_256`
+    /// with a byte shift of `SHIFT * 4`.
+    #[inline(always)]
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+        self,
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        match SHIFT {
+            0 => a,
+            lanes if lanes >= 4usize => b,
+            _ => {
+                let a_bytes = self.cvt_to_bytes_f32x8(a).val.0;
+                let b_bytes = self.cvt_to_bytes_f32x8(b).val.0;
+                let shifted = dyn_alignr_256(self, b_bytes, a_bytes, SHIFT * 4usize);
+                let wrapped = u8x32 {
+                    val: crate::support::Aligned256(shifted),
+                    simd: self,
+                };
+                self.cvt_from_bytes_f32x8(wrapped)
+            }
+        }
+    }
+    /// Lane-wise absolute value of an `f32x8`.
+    #[inline(always)]
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                // `-0.0` has only the sign bit set; and-not keeps every other bit of `a`.
+                _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Lane-wise negation of an `f32x8`.
+    #[inline(always)]
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                // XOR with `-0.0` flips only the sign bit of each lane.
+                _mm256_xor_ps(a.into(), _mm256_set1_ps(-0.0)).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Lane-wise square root of an `f32x8` via `_mm256_sqrt_ps`.
+    #[inline(always)]
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_sqrt_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Lane-wise approximate reciprocal of an `f32x8` via `_mm256_rcp14_ps`.
+    ///
+    /// The result is an approximation, not an exactly rounded `1.0 / a`.
+    #[inline(always)]
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_rcp14_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Lane-wise `a + b` for `f32x8` via `_mm256_add_ps`.
+    #[inline(always)]
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_add_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a - b` for `f32x8` via `_mm256_sub_ps`.
+    #[inline(always)]
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_sub_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a * b` for `f32x8` via `_mm256_mul_ps`.
+    #[inline(always)]
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_mul_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a / b` for `f32x8` via `_mm256_div_ps`.
+    #[inline(always)]
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_div_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise copysign: the magnitude comes from `a`, the sign bit from `b`.
+    #[inline(always)]
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                // `-0.0` is a mask with only the sign bit set in each lane.
+                let mask = _mm256_set1_ps(-0.0);
+                _mm256_or_ps(
+                    // Sign bit of `b`.
+                    _mm256_and_ps(mask, b.into()),
+                    // Everything except the sign bit of `a`.
+                    _mm256_andnot_ps(mask, a.into()),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a == b` for `f32x8`, returned as a compact bit mask.
+    ///
+    /// Predicate `0` is `_CMP_EQ_OQ` (ordered, quiet), so lanes involving NaN
+    /// compare false.
+    #[inline(always)]
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a < b` for `f32x8`, returned as a compact bit mask.
+    ///
+    /// Predicate `17` is `_CMP_LT_OQ` (ordered, quiet), so lanes involving NaN
+    /// compare false.
+    #[inline(always)]
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<17i32>(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a <= b` for `f32x8`, returned as a compact bit mask.
+    ///
+    /// Predicate `18` is `_CMP_LE_OQ` (ordered, quiet), so lanes involving NaN
+    /// compare false.
+    #[inline(always)]
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<18i32>(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a >= b` for `f32x8`, returned as a compact bit mask.
+    ///
+    /// Predicate `29` is `_CMP_GE_OQ` (ordered, quiet), so lanes involving NaN
+    /// compare false.
+    #[inline(always)]
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<29i32>(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a > b` for `f32x8`, returned as a compact bit mask.
+    ///
+    /// Predicate `30` is `_CMP_GT_OQ` (ordered, quiet), so lanes involving NaN
+    /// compare false.
+    #[inline(always)]
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmp_ps_mask::<30i32>(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves the low four lanes of `a` and `b`:
+    /// `[a0, b0, a1, b1, a2, b2, a3, b3]`.
+    #[inline(always)]
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                // In the two-source permute, indices 0..=7 pick from `a` and 8..=15 from `b`.
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves the high four lanes of `a` and `b`:
+    /// `[a4, b4, a5, b5, a6, b6, a7, b7]`.
+    #[inline(always)]
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                // In the two-source permute, indices 0..=7 pick from `a` and 8..=15 from `b`.
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Gathers the even-indexed lanes of the concatenation of `a` and `b`:
+    /// `[a0, a2, a4, a6, b0, b2, b4, b6]`.
+    #[inline(always)]
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                // In the two-source permute, indices 0..=7 pick from `a` and 8..=15 from `b`.
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Gathers the odd-indexed lanes of the concatenation of `a` and `b`:
+    /// `[a1, a3, a5, a7, b1, b3, b5, b7]`.
+    #[inline(always)]
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                // In the two-source permute, indices 0..=7 pick from `a` and 8..=15 from `b`.
+                _mm256_permutex2var_ps(
+                    a.into(),
+                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves `a` and `b` lane by lane and returns both halves of the result.
+    ///
+    /// The first vector uses the same index pattern as `zip_low_f32x8`, the
+    /// second the same pattern as `zip_high_f32x8`.
+    #[inline(always)]
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+            ) -> (f32x8<Avx512>, f32x8<Avx512>) {
+                // Convert once so both permutes reuse the same raw registers.
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Splits the concatenation of `a` and `b` into its even-indexed lanes and
+    /// its odd-indexed lanes, returned in that order.
+    ///
+    /// The index patterns match `unzip_low_f32x8` and `unzip_high_f32x8`.
+    #[inline(always)]
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+            ) -> (f32x8<Avx512>, f32x8<Avx512>) {
+                // Convert once so both permutes reuse the same raw registers.
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_ps(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise maximum of two `f32x8` vectors via `_mm256_max_ps`.
+    ///
+    /// NaN and signed-zero handling is whatever the instruction provides; see
+    /// `max_precise_f32x8` for the alternative implementation.
+    #[inline(always)]
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_max_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise minimum of two `f32x8` vectors via `_mm256_min_ps`.
+    ///
+    /// NaN and signed-zero handling is whatever the instruction provides; see
+    /// `min_precise_f32x8` for the alternative implementation.
+    #[inline(always)]
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_min_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise maximum of two `f32x8` vectors via `_mm256_range_ps`.
+    ///
+    /// Immediate `5` (`0b0101`) selects the "max" operation with the sign taken
+    /// from the comparison result.
+    #[inline(always)]
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise minimum of two `f32x8` vectors via `_mm256_range_ps`.
+    ///
+    /// Immediate `4` (`0b0100`) selects the "min" operation with the sign taken
+    /// from the comparison result.
+    #[inline(always)]
+    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise fused multiply-add `a * b + c` via `_mm256_fmadd_ps`.
+    #[inline(always)]
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+                c: f32x8<Avx512>,
+            ) -> f32x8<Avx512> {
+                _mm256_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    /// Lane-wise fused multiply-subtract `a * b - c` via `_mm256_fmsub_ps`.
+    #[inline(always)]
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x8<Avx512>,
+                b: f32x8<Avx512>,
+                c: f32x8<Avx512>,
+            ) -> f32x8<Avx512> {
+                _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    /// Rounds every lane toward negative infinity.
+    #[inline(always)]
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                // `_MM_FROUND_NO_EXC` suppresses floating-point exceptions for the rounding.
+                _mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Rounds every lane toward positive infinity.
+    #[inline(always)]
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                // `_MM_FROUND_NO_EXC` suppresses floating-point exceptions for the rounding.
+                _mm256_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Rounds every lane to the nearest integer, with ties going to the even value.
+    #[inline(always)]
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                // `_MM_FROUND_NO_EXC` suppresses floating-point exceptions for the rounding.
+                _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Fractional part of every lane, computed as `a - trunc(a)`.
+    #[inline(always)]
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let whole = self.trunc_f32x8(a);
+        a - whole
+    }
+    /// Rounds every lane toward zero.
+    #[inline(always)]
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f32x8<Avx512> {
+                // `_MM_FROUND_NO_EXC` suppresses floating-point exceptions for the rounding.
+                _mm256_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Lane-wise select: lanes whose bit is set in mask `a` take the value from
+    /// `b`, all other lanes take the value from `c`.
+    #[inline(always)]
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask32x8<Avx512>,
+                b: f32x8<Avx512>,
+                c: f32x8<Avx512>,
+            ) -> f32x8<Avx512> {
+                // The mask blend returns its last operand where the bit is set, hence `c, b`.
+                _mm256_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    /// Concatenates two `f32x8` vectors into an `f32x16`: `a` becomes the low
+    /// 256 bits and `b` is inserted as the high 256 bits.
+    #[inline(always)]
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>, b: f32x8<Avx512>) -> f32x16<Avx512> {
+                _mm512_insertf32x8::<1>(_mm512_castps256_ps512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Splits an `f32x8` into its low and high 128-bit halves, returned in that
+    /// order as two `f32x4` vectors.
+    #[inline(always)]
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> (f32x4<Avx512>, f32x4<Avx512>) {
+                (
+                    _mm256_extractf128_ps::<0>(a.into()).simd_into(token),
+                    _mm256_extractf128_ps::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    /// Reinterprets the bits of an `f32x8` as an `f64x4`; no numeric conversion
+    /// takes place.
+    #[inline(always)]
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> f64x4<Avx512> {
+                _mm256_castps_pd(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Reinterprets the bits of an `f32x8` as an `i32x8`; no numeric conversion
+    /// takes place.
+    #[inline(always)]
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_castps_si256(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Reinterprets the bits of an `f32x8` as a `u8x32`; no numeric conversion
+    /// takes place.
+    #[inline(always)]
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u8x32<Avx512> {
+                _mm256_castps_si256(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Reinterprets the bits of an `f32x8` as a `u32x8`; no numeric conversion
+    /// takes place.
+    #[inline(always)]
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_castps_si256(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Converts every `f32` lane to `u32` with truncation via
+    /// `_mm256_cvttps_epu32`.
+    ///
+    /// No extra handling is applied here for negative, NaN, or out-of-range
+    /// inputs; `cvt_u32_precise_f32x8` adds that.
+    #[inline(always)]
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_cvttps_epu32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Converts every `f32` lane to `u32` with truncation and explicit handling
+    /// of inputs outside the `u32` range.
+    ///
+    /// Inputs are first clamped from below at zero, and lanes above the largest
+    /// `f32` below 2^32 are forced to `u32::MAX`.
+    #[inline(always)]
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> u32x8<Avx512> {
+                // Negative lanes become 0. `_mm256_max_ps` returns its second operand
+                // when an input is NaN, so NaN lanes become 0 as well.
+                let a = _mm256_max_ps(a.into(), _mm256_setzero_ps());
+                let mut converted = _mm256_cvttps_epu32(a);
+                // Predicate 17 is `_CMP_LT_OQ`: true where 4294967040.0 < a.
+                // 4294967040.0 is the largest `f32` below 2^32.
+                let exceeds_unsigned_range =
+                    _mm256_cmp_ps_mask::<17i32>(_mm256_set1_ps(4294967040.0), a);
+                // Saturate the flagged lanes to `u32::MAX`.
+                converted = _mm256_mask_blend_epi32(
+                    exceeds_unsigned_range,
+                    converted,
+                    _mm256_set1_epi32(u32::MAX.cast_signed()),
+                );
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Converts every `f32` lane to `i32` with truncation via
+    /// `_mm256_cvttps_epi32`.
+    ///
+    /// No extra handling is applied here for NaN or out-of-range inputs;
+    /// `cvt_i32_precise_f32x8` adds that.
+    #[inline(always)]
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_cvttps_epi32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Converts every `f32` lane to `i32` with truncation and explicit handling
+    /// of large and NaN inputs.
+    ///
+    /// Lanes that are not below 2^31 yield `i32::MAX`, and NaN lanes yield 0.
+    #[inline(always)]
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x8<Avx512>) -> i32x8<Avx512> {
+                let a = a.into();
+                // Predicate 17 is `_CMP_LT_OQ`: true where a < 2^31, false for NaN.
+                let in_range = _mm256_cmp_ps_mask::<17i32>(a, _mm256_set1_ps(2147483648.0));
+                // Masked convert: lanes outside the mask keep `i32::MAX` from the source.
+                let mut converted =
+                    _mm256_mask_cvttps_epi32(_mm256_set1_epi32(i32::MAX), in_range, a);
+                // Predicate 7 is `_CMP_ORD_Q`: true where the lane is not NaN.
+                let is_not_nan = _mm256_cmp_ps_mask::<7i32>(a, a);
+                // NaN lanes are replaced by zero.
+                converted = _mm256_mask_blend_epi32(is_not_nan, _mm256_setzero_si256(), converted);
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Broadcasts `val` into every lane of an `i8x32` using `_mm256_set1_epi8`.
+    #[inline(always)]
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: i8) -> i8x32<Avx512> {
+                _mm256_set1_epi8(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    /// Builds an `i8x32` from an owned array of 32 `i8` values by copying its
+    /// bytes through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        i8x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds an `i8x32` from a borrowed array of 32 `i8` values by copying its
+    /// bytes through `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        i8x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the 32 lanes of `a` out into a plain `[i8; 32]`.
+    #[inline(always)]
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [i8; 32usize]>(raw)
+    }
+    /// Views the register stored in `a` as a shared reference to `[i8; 32]`.
+    #[inline(always)]
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [i8; 32usize]>(raw)
+    }
+    /// Views the register stored in `a` as a mutable reference to `[i8; 32]`.
+    #[inline(always)]
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [i8; 32usize]>(raw)
+    }
+    /// Writes the 32 lanes of `a` into `dest` via `checked_transmute_store`.
+    #[inline(always)]
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+        let raw = a.val.0;
+        crate::transmute::checked_transmute_store(raw, dest);
+    }
+    /// Reinterprets the storage of a `u8x32` as an `i8x32` by copying it through
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&a.val);
+        i8x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Reinterprets the storage of an `i8x32` as a `u8x32` by copying it through
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let bytes = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: bytes,
+            simd: self,
+        }
+    }
+    /// Slides a window of 32 lanes over the concatenation of `a` and `b`,
+    /// starting `SHIFT` lanes into `a`.
+    ///
+    /// `SHIFT == 0` yields `a`; `SHIFT >= 32` yields `b`. The const parameter is
+    /// forwarded to the kernel as a runtime `shift` argument.
+    #[inline(always)]
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x32<Avx512>,
+                b: i8x32<Avx512>,
+                shift: usize,
+            ) -> i8x32<Avx512> {
+                if shift >= 32usize {
+                    return b;
+                }
+                // Byte indices 0..32 offset by the shift (one byte per lane).
+                // Indices that reach 32 or more select bytes from the second source.
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift) as i8),
+                );
+                // Two-source byte permute over the byte views of `a` and `b`.
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_i8x32(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i8x32(b).val.0,
+                );
+                token.cvt_from_bytes_i8x32(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
+    }
+    /// Block-local variant of the slide operation for `i8x32`.
+    ///
+    /// Returns `a` unchanged when `SHIFT == 0` and `b` when `SHIFT >= 16`.
+    /// Otherwise the byte views of both inputs are handed to `dyn_alignr_256`
+    /// with a byte shift of `SHIFT`.
+    #[inline(always)]
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+        self,
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
+        match SHIFT {
+            0 => a,
+            lanes if lanes >= 16usize => b,
+            _ => {
+                let a_bytes = self.cvt_to_bytes_i8x32(a).val.0;
+                let b_bytes = self.cvt_to_bytes_i8x32(b).val.0;
+                let shifted = dyn_alignr_256(self, b_bytes, a_bytes, SHIFT);
+                let wrapped = u8x32 {
+                    val: crate::support::Aligned256(shifted),
+                    simd: self,
+                };
+                self.cvt_from_bytes_i8x32(wrapped)
+            }
+        }
+    }
+    /// Lane-wise wrapping `a + b` for `i8x32` via `_mm256_add_epi8`.
+    #[inline(always)]
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                _mm256_add_epi8(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise wrapping `a - b` for `i8x32` via `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                _mm256_sub_epi8(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise wrapping `a * b` for `i8x32`.
+    ///
+    /// No 8-bit multiply intrinsic is used here, so the product is assembled
+    /// from two 16-bit multiplies: one for the even bytes and one for the odd
+    /// bytes of each 16-bit word.
+    #[inline(always)]
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                // The low byte of each 16-bit product is the product of the even bytes.
+                let dst_even = _mm256_mullo_epi16(a.into(), b.into());
+                // Move the odd bytes into the low byte position and multiply them.
+                let dst_odd = _mm256_mullo_epi16(
+                    _mm256_srli_epi16::<8>(a.into()),
+                    _mm256_srli_epi16::<8>(b.into()),
+                );
+                // Put the odd products back into the high byte, keep only the low
+                // byte of the even products, and merge.
+                _mm256_or_si256(
+                    _mm256_slli_epi16(dst_odd, 8),
+                    _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise AND of two `i8x32` vectors via `_mm256_and_si256`.
+    #[inline(always)]
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise OR of two `i8x32` vectors via `_mm256_or_si256`.
+    #[inline(always)]
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise XOR of two `i8x32` vectors via `_mm256_xor_si256`.
+    #[inline(always)]
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise NOT of every lane, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every `i8` lane of `a` left by `shift` bits, discarding bits that
+    /// leave the lane (wrapping semantics).
+    ///
+    /// No 8-bit shift intrinsic is used here, so each byte is widened to 16
+    /// bits, shifted, and narrowed again. The shifted word is masked to its low
+    /// byte before `_mm256_packus_epi16` so the pack never saturates. A signed
+    /// saturating pack would clamp results such as `64 << 1` to 127 instead of
+    /// wrapping to -128. The narrowing matches the one in `shlv_i8x32`.
+    #[inline(always)]
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, shift: u32) -> i8x32<Avx512> {
+                let val = a.into();
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+                // The high byte of each widened word is masked away below, so
+                // zero extension is sufficient for a left shift.
+                let zero = _mm256_setzero_si256();
+                let lo_16 = _mm256_unpacklo_epi8(val, zero);
+                let hi_16 = _mm256_unpackhi_epi8(val, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), byte_mask);
+                // Unpack and pack both work per 128-bit lane, so lane order round-trips.
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    /// Shifts every `i8` lane of `a` left by the count held in the matching lane
+    /// of `b`.
+    ///
+    /// Bytes are widened to 16 bits, shifted with `_mm256_sllv_epi16`, masked to
+    /// the low byte, and narrowed again.
+    #[inline(always)]
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm256_setzero_si256();
+                // Values are zero-extended; the high byte is masked off after the shift.
+                let value_extend = zero;
+                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+                // Counts are zero-extended, i.e. treated as unsigned bytes.
+                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
+                // Every word is at most 0x00ff here, so the unsigned saturating pack
+                // passes the low bytes through unchanged.
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Arithmetic right shift of every `i8` lane of `a` by `shift` bits.
+    ///
+    /// Bytes are sign-extended to 16 bits, shifted with `_mm256_sra_epi16`, and
+    /// narrowed again with a signed saturating pack.
+    #[inline(always)]
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, shift: u32) -> i8x32<Avx512> {
+                let val = a.into();
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+                // `0 > val` is all-ones for negative bytes, which supplies the
+                // sign-extension byte for each widened word.
+                let lo_16 =
+                    _mm256_unpacklo_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
+                let hi_16 =
+                    _mm256_unpackhi_epi8(val, _mm256_cmpgt_epi8(_mm256_setzero_si256(), val));
+                let lo_shifted = _mm256_sra_epi16(lo_16, shift_count);
+                let hi_shifted = _mm256_sra_epi16(hi_16, shift_count);
+                // An arithmetic right shift of a sign-extended byte stays within the
+                // `i8` range, so the saturating pack does not clamp.
+                _mm256_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    /// Arithmetic right shift of every `i8` lane of `a` by the count held in the
+    /// matching lane of `b`.
+    ///
+    /// Bytes are sign-extended to 16 bits, shifted with `_mm256_srav_epi16`,
+    /// masked to the low byte, and narrowed again.
+    #[inline(always)]
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm256_setzero_si256();
+                // `0 > val` is all-ones for negative bytes: the sign-extension byte.
+                let value_extend = _mm256_cmpgt_epi8(zero, val);
+                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
+                // Counts are zero-extended, i.e. treated as unsigned bytes.
+                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_srav_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_srav_epi16(hi_values, hi_counts), byte_mask);
+                // Every word is at most 0x00ff here, so the unsigned saturating pack
+                // passes the low bytes through unchanged.
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a == b`, returned as a compact mask produced by `_mm256_cmpeq_epi8_mask`.
+    #[inline(always)]
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmpeq_epi8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise signed `a < b`, returned as a compact mask produced by `_mm256_cmplt_epi8_mask`.
+    #[inline(always)]
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmplt_epi8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise signed `a <= b`, returned as a compact mask produced by `_mm256_cmple_epi8_mask`.
+    #[inline(always)]
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmple_epi8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise signed `a >= b`, returned as a compact mask produced by `_mm256_cmpge_epi8_mask`.
+    #[inline(always)]
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmpge_epi8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise signed `a > b`, returned as a compact mask produced by `_mm256_cmpgt_epi8_mask`.
+    #[inline(always)]
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmpgt_epi8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves the low halves of `a` and `b`: `a[0], b[0], a[1], b[1], ..., a[15], b[15]`.
+    ///
+    /// Done with one two-source byte permute; indices 0..=31 pick from `a`,
+    /// 32..=63 pick from `b`.
+    #[inline(always)]
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+                    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+                );
+                let zipped = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                zipped.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves the high halves of `a` and `b`: `a[16], b[16], a[17], b[17], ..., a[31], b[31]`.
+    ///
+    /// Done with one two-source byte permute; indices 0..=31 pick from `a`,
+    /// 32..=63 pick from `b`.
+    #[inline(always)]
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+                    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+                );
+                let zipped = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                zipped.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Gathers the even-indexed lanes of the 64-lane sequence `a` followed by `b`.
+    ///
+    /// The first 16 output lanes are `a[0], a[2], ..., a[30]`; the last 16 are
+    /// `b[0], b[2], ..., b[30]`.
+    #[inline(always)]
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+                );
+                let evens = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                evens.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Gathers the odd-indexed lanes of the 64-lane sequence `a` followed by `b`.
+    ///
+    /// The first 16 output lanes are `a[1], a[3], ..., a[31]`; the last 16 are
+    /// `b[1], b[3], ..., b[31]`.
+    #[inline(always)]
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                    33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+                );
+                let odds = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                odds.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves `a` and `b` lane by lane and returns both halves of the result.
+    ///
+    /// The first vector is `a[0], b[0], ..., a[15], b[15]`; the second is
+    /// `a[16], b[16], ..., a[31], b[31]`. Each half is one two-source byte permute.
+    #[inline(always)]
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x32<Avx512>,
+                b: i8x32<Avx512>,
+            ) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                let low_idx = _mm256_setr_epi8(
+                    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+                    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+                );
+                let high_idx = _mm256_setr_epi8(
+                    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+                    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+                );
+                let low = _mm256_permutex2var_epi8(a, low_idx, b);
+                let high = _mm256_permutex2var_epi8(a, high_idx, b);
+                (low.simd_into(token), high.simd_into(token))
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Splits the 64-lane sequence `a` followed by `b` into its even and odd lanes.
+    ///
+    /// The first vector holds lanes 0, 2, ..., 62 of that sequence and the second
+    /// holds lanes 1, 3, ..., 63. Each output is one two-source byte permute.
+    #[inline(always)]
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i8x32<Avx512>,
+                b: i8x32<Avx512>,
+            ) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                let even_idx = _mm256_setr_epi8(
+                    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+                );
+                let odd_idx = _mm256_setr_epi8(
+                    1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                    33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+                );
+                let evens = _mm256_permutex2var_epi8(a, even_idx, b);
+                let odds = _mm256_permutex2var_epi8(a, odd_idx, b);
+                (evens.simd_into(token), odds.simd_into(token))
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Per-lane select: lanes whose bit is set in mask `a` come from `b`, the rest from `c`.
+    #[inline(always)]
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x32<Avx512>,
+                b: i8x32<Avx512>,
+                c: i8x32<Avx512>,
+            ) -> i8x32<Avx512> {
+                let when_set = b.into();
+                let when_clear = c.into();
+                // `mask_blend` takes the second source where the mask bit is 1.
+                let blended = _mm256_mask_blend_epi8(a.val, when_clear, when_set);
+                blended.simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    /// Lane-wise signed minimum of `a` and `b` via `_mm256_min_epi8`.
+    #[inline(always)]
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let smallest = _mm256_min_epi8(lhs, rhs);
+                smallest.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise signed maximum of `a` and `b` via `_mm256_max_epi8`.
+    #[inline(always)]
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let largest = _mm256_max_epi8(lhs, rhs);
+                largest.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Concatenates two 256-bit vectors into one 512-bit vector: `a` fills the
+    /// low 256 bits and `b` the high 256 bits.
+    #[inline(always)]
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x64<Avx512> {
+                // The cast leaves the upper half unspecified; the insert overwrites it with `b`.
+                let widened = _mm512_castsi256_si512(a.into());
+                let joined = _mm512_inserti64x4::<1>(widened, b.into());
+                joined.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Splits a 256-bit vector into its low and high 128-bit halves, in that order.
+    #[inline(always)]
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>) -> (i8x16<Avx512>, i8x16<Avx512>) {
+                let low = _mm256_extracti128_si256::<0>(a.into());
+                let high = _mm256_extracti128_si256::<1>(a.into());
+                (low.simd_into(token), high.simd_into(token))
+            }
+        );
+        kernel(self, a)
+    }
+    /// Lane-wise negation, computed as `0 - a` with `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>) -> i8x32<Avx512> {
+                let zero = _mm256_setzero_si256();
+                let negated = _mm256_sub_epi8(zero, a.into());
+                negated.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Reinterprets the 256 bits of an `i8x32` as a `u8x32`; the bit pattern is unchanged.
+    #[inline(always)]
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>) -> u8x32<Avx512> {
+                let bits: __m256i = a.into();
+                bits.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Reinterprets the 256 bits of an `i8x32` as a `u32x8`; the bit pattern is unchanged.
+    #[inline(always)]
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i8x32<Avx512>) -> u32x8<Avx512> {
+                let bits: __m256i = a.into();
+                bits.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Builds a `u8x32` with every lane set to `val`.
+    #[inline(always)]
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: u8) -> u8x32<Avx512> {
+                // The intrinsic takes an `i8`; the bit pattern of `val` is preserved.
+                let lane = val.cast_signed();
+                let filled = _mm256_set1_epi8(lane);
+                filled.simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    /// Builds a `u8x32` from a by-value array of 32 bytes through a checked bit copy.
+    #[inline(always)]
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(&val);
+        u8x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Builds a `u8x32` from a borrowed array of 32 bytes through a checked bit copy.
+    #[inline(always)]
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        let lanes = crate::transmute::checked_transmute_copy(val);
+        u8x32 {
+            val: lanes,
+            simd: self,
+        }
+    }
+    /// Copies the 32 lanes of `a` out into a plain byte array.
+    #[inline(always)]
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(raw)
+    }
+    /// Views the storage of `a` as a shared reference to a 32-byte array, without copying.
+    #[inline(always)]
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        let raw: &__m256i = &a.val.0;
+        crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(raw)
+    }
+    /// Views the storage of `a` as a mutable reference to a 32-byte array, without copying.
+    #[inline(always)]
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        let raw: &mut __m256i = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(raw)
+    }
+    /// Writes the 32 lanes of `a` into `dest`.
+    #[inline(always)]
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        let raw = a.val.0;
+        crate::transmute::checked_transmute_store(raw, dest);
+    }
+    /// Converts a byte vector into a `u8x32` by a checked bit copy of its storage.
+    ///
+    /// Input and output are both `u8x32`, so the lanes come back unchanged.
+    #[inline(always)]
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Converts a `u8x32` into its byte-vector form by a checked bit copy of its storage.
+    ///
+    /// Input and output are both `u8x32`, so the lanes come back unchanged.
+    #[inline(always)]
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let storage = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 {
+            val: storage,
+            simd: self,
+        }
+    }
+    /// Extracts 32 consecutive lanes, starting at lane `SHIFT`, from the 64-lane
+    /// sequence `a` followed by `b`.
+    ///
+    /// `SHIFT == 0` yields `a`; any `SHIFT >= 32` yields `b` unchanged. Otherwise
+    /// the window is gathered with one two-source byte permute whose indices are
+    /// `lane + SHIFT`.
+    #[inline(always)]
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x32<Avx512>,
+                b: u8x32<Avx512>,
+                shift: usize,
+            ) -> u8x32<Avx512> {
+                if shift > 31usize {
+                    return b;
+                }
+                let lane_ids = _mm256_setr_epi8(
+                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                );
+                // `shift` is below 32 here, so `lane + shift` stays within 0..=62.
+                let offset = _mm256_set1_epi8((shift) as i8);
+                let idx = _mm256_add_epi8(lane_ids, offset);
+                let a_bytes = token.cvt_to_bytes_u8x32(a).val.0;
+                let b_bytes = token.cvt_to_bytes_u8x32(b).val.0;
+                let window = _mm256_permutex2var_epi8(a_bytes, idx, b_bytes);
+                let wrapped = u8x32 {
+                    val: crate::support::Aligned256(window),
+                    simd: token,
+                };
+                token.cvt_from_bytes_u8x32(wrapped)
+            }
+        );
+        kernel(self, a, b, SHIFT)
+    }
+    /// Slides `a` and `b` by `SHIFT` lanes using the `dyn_alignr_256` helper.
+    ///
+    /// `SHIFT == 0` returns `a` and `SHIFT >= 16` returns `b` without touching the
+    /// helper. NOTE(review): `dyn_alignr_256` is defined elsewhere; presumably it
+    /// performs the shift independently inside each 128-bit block — confirm there.
+    #[inline(always)]
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+        self,
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        match SHIFT {
+            0 => a,
+            s if s >= 16usize => b,
+            _ => {
+                let a_bytes = self.cvt_to_bytes_u8x32(a).val.0;
+                let b_bytes = self.cvt_to_bytes_u8x32(b).val.0;
+                // The helper takes the `b` operand first.
+                let shifted = dyn_alignr_256(self, b_bytes, a_bytes, SHIFT);
+                let wrapped = u8x32 {
+                    val: crate::support::Aligned256(shifted),
+                    simd: self,
+                };
+                self.cvt_from_bytes_u8x32(wrapped)
+            }
+        }
+    }
+    /// Lane-wise wrapping addition of `a` and `b` via `_mm256_add_epi8`.
+    #[inline(always)]
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let sum = _mm256_add_epi8(lhs, rhs);
+                sum.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise wrapping subtraction `a - b` via `_mm256_sub_epi8`.
+    #[inline(always)]
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let difference = _mm256_sub_epi8(lhs, rhs);
+                difference.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise wrapping multiplication of two `u8x32` vectors.
+    ///
+    /// There is no 8-bit multiply here, so the even and odd bytes of every 16-bit
+    /// lane are multiplied separately with `_mm256_mullo_epi16` and the two low
+    /// bytes are stitched back together.
+    #[inline(always)]
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                // Even bytes: the low byte of the 16-bit product depends only on the low bytes.
+                let even_products = _mm256_mullo_epi16(lhs, rhs);
+                // Odd bytes: bring them down to the low byte of each lane first.
+                let odd_products =
+                    _mm256_mullo_epi16(_mm256_srli_epi16::<8>(lhs), _mm256_srli_epi16::<8>(rhs));
+                let odd_in_place = _mm256_slli_epi16(odd_products, 8);
+                let even_in_place = _mm256_and_si256(even_products, _mm256_set1_epi16(0xFF));
+                _mm256_or_si256(odd_in_place, even_in_place).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise AND of `a` and `b` via `_mm256_and_si256`.
+    #[inline(always)]
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let bits = _mm256_and_si256(lhs, rhs);
+                bits.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise OR of `a` and `b` via `_mm256_or_si256`.
+    #[inline(always)]
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let bits = _mm256_or_si256(lhs, rhs);
+                bits.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise XOR of `a` and `b` via `_mm256_xor_si256`.
+    #[inline(always)]
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let bits = _mm256_xor_si256(lhs, rhs);
+                bits.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Bitwise NOT of every lane, expressed as XOR with an all-ones scalar.
+    #[inline(always)]
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let all_ones = !0;
+        a ^ all_ones
+    }
+    /// Shifts every `u8` lane of `a` left by `shift` bits, discarding bits shifted
+    /// out of the lane.
+    ///
+    /// There is no 8-bit shift instruction, so the bytes are zero-extended to
+    /// 16-bit lanes, shifted with `_mm256_sll_epi16`, and narrowed again. The
+    /// shifted lanes are masked to their low byte before narrowing because
+    /// `_mm256_packus_epi16` saturates: without the mask, any lane whose shifted
+    /// value exceeds 255 (e.g. `0x80 << 1 == 0x0100`) would come back as `0xFF`
+    /// instead of the truncated byte.
+    #[inline(always)]
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, shift: u32) -> u8x32<Avx512> {
+                let val = a.into();
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+                let zero = _mm256_setzero_si256();
+                let lo_16 = _mm256_unpacklo_epi8(val, zero);
+                let hi_16 = _mm256_unpackhi_epi8(val, zero);
+                // Keep only the low byte so the saturating pack acts as a plain truncation.
+                let byte_mask = _mm256_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm256_and_si256(_mm256_sll_epi16(lo_16, shift_count), byte_mask);
+                let hi_shifted =
+                    _mm256_and_si256(_mm256_sll_epi16(hi_16, shift_count), byte_mask);
+                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    /// Shifts each `u8` lane of `a` left by the count in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16-bit lanes for `_mm256_sllv_epi16`;
+    /// each result is masked to its low byte so the saturating pack that follows
+    /// only truncates.
+    #[inline(always)]
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let values = a.into();
+                let amounts = b.into();
+                let zero = _mm256_setzero_si256();
+                let low_byte = _mm256_set1_epi16(0x00ff);
+                let lo = _mm256_sllv_epi16(
+                    _mm256_unpacklo_epi8(values, zero),
+                    _mm256_unpacklo_epi8(amounts, zero),
+                );
+                let hi = _mm256_sllv_epi16(
+                    _mm256_unpackhi_epi8(values, zero),
+                    _mm256_unpackhi_epi8(amounts, zero),
+                );
+                let lo = _mm256_and_si256(lo, low_byte);
+                let hi = _mm256_and_si256(hi, low_byte);
+                _mm256_packus_epi16(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Logical (zero-filling) right shift of every `u8` lane of `a` by `shift` bits.
+    ///
+    /// The bytes are zero-extended to 16-bit lanes, shifted with
+    /// `_mm256_srl_epi16`, and narrowed again with `_mm256_packus_epi16`.
+    #[inline(always)]
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, shift: u32) -> u8x32<Avx512> {
+                let bytes = a.into();
+                let zero = _mm256_setzero_si256();
+                let count = _mm_cvtsi32_si128(shift.cast_signed());
+                let lo = _mm256_srl_epi16(_mm256_unpacklo_epi8(bytes, zero), count);
+                let hi = _mm256_srl_epi16(_mm256_unpackhi_epi8(bytes, zero), count);
+                _mm256_packus_epi16(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    /// Logical right shift of each `u8` lane of `a` by the count in the matching lane of `b`.
+    ///
+    /// Values and counts are zero-extended to 16-bit lanes for `_mm256_srlv_epi16`;
+    /// each result is masked to its low byte before being packed back to 8 bits.
+    #[inline(always)]
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let values = a.into();
+                let amounts = b.into();
+                let zero = _mm256_setzero_si256();
+                let low_byte = _mm256_set1_epi16(0x00ff);
+                let lo = _mm256_srlv_epi16(
+                    _mm256_unpacklo_epi8(values, zero),
+                    _mm256_unpacklo_epi8(amounts, zero),
+                );
+                let hi = _mm256_srlv_epi16(
+                    _mm256_unpackhi_epi8(values, zero),
+                    _mm256_unpackhi_epi8(amounts, zero),
+                );
+                let lo = _mm256_and_si256(lo, low_byte);
+                let hi = _mm256_and_si256(hi, low_byte);
+                _mm256_packus_epi16(lo, hi).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise `a == b`, returned as a compact mask produced by `_mm256_cmpeq_epu8_mask`.
+    #[inline(always)]
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmpeq_epu8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise unsigned `a < b`, returned as a compact mask produced by `_mm256_cmplt_epu8_mask`.
+    #[inline(always)]
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmplt_epu8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise unsigned `a <= b`, returned as a compact mask produced by `_mm256_cmple_epu8_mask`.
+    #[inline(always)]
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmple_epu8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise unsigned `a >= b`, returned as a compact mask produced by `_mm256_cmpge_epu8_mask`.
+    #[inline(always)]
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmpge_epu8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise unsigned `a > b`, returned as a compact mask produced by `_mm256_cmpgt_epu8_mask`.
+    #[inline(always)]
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let val = _mm256_cmpgt_epu8_mask(lhs, rhs);
+                mask8x32 { val, simd: token }
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves the low halves of `a` and `b`: `a[0], b[0], a[1], b[1], ..., a[15], b[15]`.
+    ///
+    /// Done with one two-source byte permute; indices 0..=31 pick from `a`,
+    /// 32..=63 pick from `b`.
+    #[inline(always)]
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+                    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+                );
+                let zipped = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                zipped.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves the high halves of `a` and `b`: `a[16], b[16], a[17], b[17], ..., a[31], b[31]`.
+    ///
+    /// Done with one two-source byte permute; indices 0..=31 pick from `a`,
+    /// 32..=63 pick from `b`.
+    #[inline(always)]
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+                    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+                );
+                let zipped = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                zipped.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Gathers the even-indexed lanes of the 64-lane sequence `a` followed by `b`.
+    ///
+    /// The first 16 output lanes are `a[0], a[2], ..., a[30]`; the last 16 are
+    /// `b[0], b[2], ..., b[30]`.
+    #[inline(always)]
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+                );
+                let evens = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                evens.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Gathers the odd-indexed lanes of the 64-lane sequence `a` followed by `b`.
+    ///
+    /// The first 16 output lanes are `a[1], a[3], ..., a[31]`; the last 16 are
+    /// `b[1], b[3], ..., b[31]`.
+    #[inline(always)]
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let idx = _mm256_setr_epi8(
+                    1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                    33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+                );
+                let odds = _mm256_permutex2var_epi8(a.into(), idx, b.into());
+                odds.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Interleaves `a` and `b` lane by lane and returns both halves of the result.
+    ///
+    /// The first vector is `a[0], b[0], ..., a[15], b[15]`; the second is
+    /// `a[16], b[16], ..., a[31], b[31]`. Each half is one two-source byte permute.
+    #[inline(always)]
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x32<Avx512>,
+                b: u8x32<Avx512>,
+            ) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                let low_idx = _mm256_setr_epi8(
+                    0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+                    8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+                );
+                let high_idx = _mm256_setr_epi8(
+                    16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+                    24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
+                );
+                let low = _mm256_permutex2var_epi8(a, low_idx, b);
+                let high = _mm256_permutex2var_epi8(a, high_idx, b);
+                (low.simd_into(token), high.simd_into(token))
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Splits the 64-lane sequence `a` followed by `b` into its even and odd lanes.
+    ///
+    /// The first vector holds lanes 0, 2, ..., 62 of that sequence and the second
+    /// holds lanes 1, 3, ..., 63. Each output is one two-source byte permute.
+    #[inline(always)]
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u8x32<Avx512>,
+                b: u8x32<Avx512>,
+            ) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                let even_idx = _mm256_setr_epi8(
+                    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+                );
+                let odd_idx = _mm256_setr_epi8(
+                    1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                    33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+                );
+                let evens = _mm256_permutex2var_epi8(a, even_idx, b);
+                let odds = _mm256_permutex2var_epi8(a, odd_idx, b);
+                (evens.simd_into(token), odds.simd_into(token))
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Per-lane select: lanes whose bit is set in mask `a` come from `b`, the rest from `c`.
+    #[inline(always)]
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask8x32<Avx512>,
+                b: u8x32<Avx512>,
+                c: u8x32<Avx512>,
+            ) -> u8x32<Avx512> {
+                let when_set = b.into();
+                let when_clear = c.into();
+                // `mask_blend` takes the second source where the mask bit is 1.
+                let blended = _mm256_mask_blend_epi8(a.val, when_clear, when_set);
+                blended.simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    /// Lane-wise unsigned minimum of `a` and `b` via `_mm256_min_epu8`.
+    #[inline(always)]
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let smallest = _mm256_min_epu8(lhs, rhs);
+                smallest.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Lane-wise unsigned maximum of `a` and `b` via `_mm256_max_epu8`.
+    #[inline(always)]
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+                let lhs = a.into();
+                let rhs = b.into();
+                let largest = _mm256_max_epu8(lhs, rhs);
+                largest.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Concatenates two 256-bit vectors into one 512-bit vector: `a` fills the
+    /// low 256 bits and `b` the high 256 bits.
+    #[inline(always)]
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x64<Avx512> {
+                // The cast leaves the upper half unspecified; the insert overwrites it with `b`.
+                let widened = _mm512_castsi256_si512(a.into());
+                let joined = _mm512_inserti64x4::<1>(widened, b.into());
+                joined.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    /// Splits a 256-bit vector into its low and high 128-bit halves, in that order.
+    #[inline(always)]
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>) -> (u8x16<Avx512>, u8x16<Avx512>) {
+                let low = _mm256_extracti128_si256::<0>(a.into());
+                let high = _mm256_extracti128_si256::<1>(a.into());
+                (low.simd_into(token), high.simd_into(token))
+            }
+        );
+        kernel(self, a)
+    }
+    /// Zero-extends each of the 32 `u8` lanes to a `u16` lane via `_mm512_cvtepu8_epi16`.
+    #[inline(always)]
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>) -> u16x32<Avx512> {
+                let narrow = a.into();
+                let wide = _mm512_cvtepu8_epi16(narrow);
+                wide.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)] // Re-types the 256 bits of a `u8x32` as a `u32x8`.
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x32<Avx512>) -> u32x8<Avx512> {
+                __m256i::from(a).simd_into(token) // goes through the raw `__m256i`; no lane arithmetic is performed here
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+        // One bit per lane: all 32 low bits set for `true`, none for `false`.
+        let lanes: u64 = if val { 0xFFFF_FFFF } else { 0 };
+        let val = lanes as _; // narrowed to whatever integer type the `val` field stores
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)] // Builds a compact 32-lane mask from an array holding one `i8` per lane.
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i8; 32usize]) -> mask8x32<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val); // type inferred from the intrinsic argument below
+                mask8x32 {
+                    val: _mm256_movepi8_mask(lanes), // mask bit i = most significant bit of byte i
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)] // Expands the compact mask back into an array with one `i8` per lane.
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask8x32<Avx512>) -> [i8; 32usize] {
+                let lanes = _mm256_movm_epi8(a.val); // byte i is all-ones (-1) when mask bit i is set, 0 otherwise
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        // Bit `i` of `bits` becomes lane `i`.
+        // Bits 32 and above do not correspond to a lane and are dropped.
+        let val = (bits & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        0xFFFF_FFFF & u64::from(a.val) // lane `i` is reported in bit `i`; bits 32 and above are zero
+    }
+    #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        // Panics (through the assertion below) unless `index` names one of the 32 lanes.
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        // Single-bit selector for the requested lane.
+        let lane = 1u64 << index;
+        let cur = u64::from(a.val);
+        // Set or clear just that bit; every other lane keeps its value.
+        let val = (if value { cur | lane } else { cur & !lane }) as _;
+        *a = mask8x32 { val, simd: self };
+    }
+    #[inline(always)]
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        // Lane-wise AND, computed directly on the one-bit-per-lane representation.
+        // The final `& 0xFFFF_FFFF` keeps the result within the 32 lane bits.
+        let val = (u64::from(a.val) & u64::from(b.val) & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        // Lane-wise OR, computed directly on the one-bit-per-lane representation.
+        // The final `& 0xFFFF_FFFF` keeps the result within the 32 lane bits.
+        let val = ((u64::from(a.val) | u64::from(b.val)) & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        // Lane-wise XOR, computed directly on the one-bit-per-lane representation.
+        // The final `& 0xFFFF_FFFF` keeps the result within the 32 lane bits.
+        let val = ((u64::from(a.val) ^ u64::from(b.val)) & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        // Lane-wise NOT. Inverting the widened `u64` also sets bits 32..64,
+        // so the result is masked back down to the 32 lane bits.
+        let val = (!u64::from(a.val) & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        // Per-lane choice on the bitmask form: `b` where `a` is set, `c` where it is clear.
+        let cond = u64::from(a.val);
+        let from_b = cond & u64::from(b.val);
+        let from_c = !cond & u64::from(c.val);
+        let val = ((from_b | from_c) & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        // XOR marks the lanes that differ; inverting it within the 32 lane bits marks equal lanes.
+        let differing = u64::from(a.val ^ b.val);
+        let val = (!differing & 0xFFFF_FFFF) as _;
+        mask8x32 { val, simd: self }
+    }
+    #[inline(always)]
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        // True as soon as at least one of the 32 lane bits is set.
+        (u64::from(a.val) & 0xFFFF_FFFF) != 0
+    }
+    #[inline(always)]
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        // True only when every one of the 32 lane bits is set.
+        (u64::from(a.val) & 0xFFFF_FFFF) == 0xFFFF_FFFF
+    }
+    #[inline(always)]
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        // True when at least one of the 32 lane bits is clear.
+        (u64::from(a.val) & 0xFFFF_FFFF) != 0xFFFF_FFFF
+    }
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        // True only when none of the 32 lane bits is set.
+        (u64::from(a.val) & 0xFFFF_FFFF) == 0
+    }
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        // `a` supplies lanes 0..32 and `b` lanes 32..64 of the combined mask.
+        // All 64 bits of the result are lanes, so nothing has to be masked off.
+        let low = u64::from(a.val);
+        let val = low | (u64::from(b.val) << 32usize);
+        mask8x64 { val, simd: self }
+    }
+    #[inline(always)]
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+        // Lanes 0..16 land in the first mask, lanes 16..32 in the second.
+        let bits = u64::from(a.val);
+        let lower = {
+            let val = (bits & 0xFFFF) as _;
+            mask8x16 { val, simd: self }
+        };
+        let upper = {
+            let val = ((bits >> 16usize) & 0xFFFF) as _;
+            mask8x16 { val, simd: self }
+        };
+        (lower, upper)
+    }
+    #[inline(always)] // Creates an `i16x16` with every lane equal to `val`.
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: i16) -> i16x16<Avx512> {
+                _mm256_set1_epi16(val).simd_into(token) // broadcast to all 16 lanes
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        // Turn the 16 array elements into the vector's backing storage.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(&val);
+        i16x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        // Same as loading by value, but reads the 16 elements through the reference.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(val);
+        i16x16 { val, simd: self }
+    }
+    #[inline(always)] // Returns the 16 lanes as an owned array.
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0) // `a.val.0` is the raw `__m256i` inside the storage wrapper
+    }
+    #[inline(always)] // Borrows the vector as a shared reference to a 16-element array.
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0) // reference cast from the raw `__m256i` to the array type
+    }
+    #[inline(always)] // Borrows the vector as a mutable reference to a 16-element array.
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0) // mutable reference cast from the raw `__m256i` to the array type
+    }
+    #[inline(always)] // Stores the lanes of `a` into `dest`.
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest); // helper defined elsewhere; presumably writes the raw `__m256i` over `dest` - confirm
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        // Re-type the storage of a `u8x32` as an `i16x16`.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        i16x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        // Re-type the storage of an `i16x16` as a `u8x32`.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 { val, simd: self }
+    }
+    #[inline(always)] // Result lane `i` is element `i + SHIFT` of `a` followed by `b`; `SHIFT >= 16` yields `b`.
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x16<Avx512>,
+                b: i16x16<Avx512>,
+                shift: usize, // receives `SHIFT` as an ordinary argument (see the call at the bottom)
+            ) -> i16x16<Avx512> {
+                if shift >= 16usize { // sliding by the full width or more leaves only `b`
+                    return b;
+                }
+                let idx = _mm256_add_epi8( // byte positions 0..=31, each moved forward by the shift in bytes
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 2usize) as i8), // two bytes per lane; at most 30 here, so the cast is lossless
+                );
+                let result = _mm256_permutex2var_epi8( // byte gather over both sources; largest index is 31 + 30 = 61
+                    token.cvt_to_bytes_i16x16(a).val.0, // addressed by indices 0..=31
+                    idx,
+                    token.cvt_to_bytes_i16x16(b).val.0, // addressed by indices 32..=63
+                );
+                token.cvt_from_bytes_i16x16(u8x32 { // wrap the raw bytes, then view them as `i16` lanes again
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
+        self,
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
+        // No shift keeps `a`; a shift of 8 lanes (128 bits of `i16`) or more yields `b` unchanged.
+        match SHIFT {
+            0 => return a,
+            8usize.. => return b,
+            _ => {}
+        }
+        // Work on raw bytes: one 16-bit lane is two bytes, hence `SHIFT * 2`.
+        let lo = self.cvt_to_bytes_i16x16(a).val.0;
+        let hi = self.cvt_to_bytes_i16x16(b).val.0;
+        // `dyn_alignr_256` is defined elsewhere; presumably a runtime-count byte `alignr` - confirm.
+        let val = crate::support::Aligned256(dyn_alignr_256(self, hi, lo, SHIFT * 2usize));
+        self.cvt_from_bytes_i16x16(u8x32 { val, simd: self })
+    }
+    #[inline(always)] // Lane-wise addition of two `i16x16` vectors.
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_add_epi16(a.into(), b.into()).simd_into(token) // wraps on overflow (non-saturating intrinsic)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise subtraction `a - b` of two `i16x16` vectors.
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_sub_epi16(a.into(), b.into()).simd_into(token) // wraps on overflow (non-saturating intrinsic)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise multiplication of two `i16x16` vectors.
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) // `mullo`: keeps the low 16 bits of each 32-bit product
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise AND of two `i16x16` vectors.
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token) // acts on all 256 bits; lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise OR of two `i16x16` vectors.
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token) // acts on all 256 bits; lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise XOR of two `i16x16` vectors.
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token) // acts on all 256 bits; lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise NOT of every lane.
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        a ^ !0 // `!0` is the all-ones scalar; uses a vector-^-scalar operator impl defined elsewhere (presumably splats it - confirm)
+    }
+    #[inline(always)] // Shifts every lane left by the same `shift` amount.
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, shift: u32) -> i16x16<Avx512> {
+                _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) // the count travels in the low bits of a 128-bit register; `cast_signed` only re-types the `u32`
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)] // Shifts each lane of `a` left by the count held in the matching lane of `b`.
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) // `v` = variable, per-lane counts
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Shifts every lane right by the same `shift` amount, preserving the sign.
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, shift: u32) -> i16x16<Avx512> {
+                _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) // `sra` = arithmetic shift (sign bits shifted in), as needed for signed lanes
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)] // Shifts each lane of `a` right by the count in the matching lane of `b`, preserving the sign.
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_srav_epi16(a.into(), b.into()).simd_into(token) // `srav` = arithmetic shift with per-lane counts
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise `a == b`; the result is a compact mask with one bit per lane.
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpeq_epi16_mask(a.into(), b.into()), // bit i is set when lane i compares equal
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise signed `a < b`; the result is a compact mask with one bit per lane.
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmplt_epi16_mask(a.into(), b.into()), // `epi16`: lanes compared as signed
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise signed `a <= b`; the result is a compact mask with one bit per lane.
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmple_epi16_mask(a.into(), b.into()), // `epi16`: lanes compared as signed
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise signed `a >= b`; the result is a compact mask with one bit per lane.
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpge_epi16_mask(a.into(), b.into()), // `epi16`: lanes compared as signed
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise signed `a > b`; the result is a compact mask with one bit per lane.
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpgt_epi16_mask(a.into(), b.into()), // `epi16`: lanes compared as signed
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Interleaves the low halves of `a` and `b`: a0, b0, a1, b1, ..., a7, b7.
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(), // selected by indices 0..=15
+                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(), // selected by indices 16..=31
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Interleaves the high halves of `a` and `b`: a8, b8, a9, b9, ..., a15, b15.
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(), // selected by indices 0..=15
+                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(), // selected by indices 16..=31
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Gathers the even-indexed lanes: a0, a2, ..., a14 followed by b0, b2, ..., b14.
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(), // selected by indices 0..=15
+                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(), // selected by indices 16..=31
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Gathers the odd-indexed lanes: a1, a3, ..., a15 followed by b1, b3, ..., b15.
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(), // selected by indices 0..=15
+                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(), // selected by indices 16..=31
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Full interleave of `a` and `b`, returned as (low half pairs, high half pairs).
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x16<Avx512>,
+                b: i16x16<Avx512>,
+            ) -> (i16x16<Avx512>, i16x16<Avx512>) {
+                let a = a.into(); // raw registers, converted once and reused by both permutes
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16( // first result: a0, b0, ..., a7, b7
+                        a,
+                        _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16( // second result: a8, b8, ..., a15, b15
+                        a,
+                        _mm256_setr_epi16(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Splits the concatenation of `a` and `b` into (even-indexed lanes, odd-indexed lanes).
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i16x16<Avx512>,
+                b: i16x16<Avx512>,
+            ) -> (i16x16<Avx512>, i16x16<Avx512>) {
+                let a = a.into(); // raw registers, converted once and reused by both permutes
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16( // first result: even positions (indices 16 and up read from `b`)
+                        a,
+                        _mm256_setr_epi16(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16( // second result: odd positions (indices 16 and up read from `b`)
+                        a,
+                        _mm256_setr_epi16(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Per-lane choice: takes the lane from `b` where mask `a` is set, from `c` where it is clear.
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x16<Avx512>,
+                b: i16x16<Avx512>,
+                c: i16x16<Avx512>,
+            ) -> i16x16<Avx512> {
+                _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token) // blend returns its last operand for set bits, hence the `c`, `b` order
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)] // Lane-wise signed minimum of two `i16x16` vectors.
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_min_epi16(a.into(), b.into()).simd_into(token) // `epi16`: lanes compared as signed
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise signed maximum of two `i16x16` vectors.
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_max_epi16(a.into(), b.into()).simd_into(token) // `epi16`: lanes compared as signed
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Concatenates two 256-bit vectors into one `i16x32`: `a` is the low half, `b` the high half.
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x32<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token) // cast places `a` in bits 0..256; insert index 1 overwrites bits 256..512 with `b`
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Splits an `i16x16` into its two 128-bit halves, returned as (low, high).
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>) -> (i16x8<Avx512>, i16x8<Avx512>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token), // lanes 0..8
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token), // lanes 8..16
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)] // Lane-wise negation, computed as `0 - a`.
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>) -> i16x16<Avx512> {
+                _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token) // wrapping subtraction, so `i16::MIN` maps to itself
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)] // Re-types the 256 bits of an `i16x16` as a `u8x32`.
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>) -> u8x32<Avx512> {
+                __m256i::from(a).simd_into(token) // goes through the raw `__m256i`; no lane arithmetic is performed here
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)] // Re-types the 256 bits of an `i16x16` as a `u32x8`.
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i16x16<Avx512>) -> u32x8<Avx512> {
+                __m256i::from(a).simd_into(token) // goes through the raw `__m256i`; no lane arithmetic is performed here
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)] // Creates a `u16x16` with every lane equal to `val`.
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: u16) -> u16x16<Avx512> {
+                _mm256_set1_epi16(val.cast_signed()).simd_into(token) // the intrinsic takes `i16`; `cast_signed` keeps the bit pattern
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        // Turn the 16 array elements into the vector's backing storage.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(&val);
+        u16x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        // Same as loading by value, but reads the 16 elements through the reference.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(val);
+        u16x16 { val, simd: self }
+    }
+    #[inline(always)] // Returns the 16 lanes as an owned array.
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0) // `a.val.0` is the raw `__m256i` inside the storage wrapper
+    }
+    #[inline(always)] // Borrows the vector as a shared reference to a 16-element array.
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0) // reference cast from the raw `__m256i` to the array type
+    }
+    #[inline(always)] // Borrows the vector as a mutable reference to a 16-element array.
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0) // mutable reference cast from the raw `__m256i` to the array type
+    }
+    #[inline(always)] // Stores the lanes of `a` into `dest`.
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest); // helper defined elsewhere; presumably writes the raw `__m256i` over `dest` - confirm
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        // Re-type the storage of a `u8x32` as a `u16x16`.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u16x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        // Re-type the storage of a `u16x16` as a `u8x32`.
+        // The destination type of the conversion is inferred from the `val` field.
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x32 { val, simd: self }
+    }
+    #[inline(always)] // Result lane `i` is element `i + SHIFT` of `a` followed by `b`; `SHIFT >= 16` yields `b`.
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x16<Avx512>,
+                b: u16x16<Avx512>,
+                shift: usize, // receives `SHIFT` as an ordinary argument (see the call at the bottom)
+            ) -> u16x16<Avx512> {
+                if shift >= 16usize { // sliding by the full width or more leaves only `b`
+                    return b;
+                }
+                let idx = _mm256_add_epi8( // byte positions 0..=31, each moved forward by the shift in bytes
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 2usize) as i8), // two bytes per lane; at most 30 here, so the cast is lossless
+                );
+                let result = _mm256_permutex2var_epi8( // byte gather over both sources; largest index is 31 + 30 = 61
+                    token.cvt_to_bytes_u16x16(a).val.0, // addressed by indices 0..=31
+                    idx,
+                    token.cvt_to_bytes_u16x16(b).val.0, // addressed by indices 32..=63
+                );
+                token.cvt_from_bytes_u16x16(u8x32 { // wrap the raw bytes, then view them as `u16` lanes again
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+        self,
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        // No shift keeps `a`; a shift of 8 lanes (128 bits of `u16`) or more yields `b` unchanged.
+        match SHIFT {
+            0 => return a,
+            8usize.. => return b,
+            _ => {}
+        }
+        // Work on raw bytes: one 16-bit lane is two bytes, hence `SHIFT * 2`.
+        let lo = self.cvt_to_bytes_u16x16(a).val.0;
+        let hi = self.cvt_to_bytes_u16x16(b).val.0;
+        // `dyn_alignr_256` is defined elsewhere; presumably a runtime-count byte `alignr` - confirm.
+        let val = crate::support::Aligned256(dyn_alignr_256(self, hi, lo, SHIFT * 2usize));
+        self.cvt_from_bytes_u16x16(u8x32 { val, simd: self })
+    }
+    #[inline(always)] // Lane-wise addition of two `u16x16` vectors.
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_add_epi16(a.into(), b.into()).simd_into(token) // wraps on overflow; the same instruction serves signed and unsigned lanes
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise subtraction `a - b` of two `u16x16` vectors.
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_sub_epi16(a.into(), b.into()).simd_into(token) // wraps on underflow; the same instruction serves signed and unsigned lanes
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise multiplication of two `u16x16` vectors.
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_mullo_epi16(a.into(), b.into()).simd_into(token) // `mullo`: keeps the low 16 bits of each product
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise AND of two `u16x16` vectors.
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token) // acts on all 256 bits; lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise OR of two `u16x16` vectors.
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token) // acts on all 256 bits; lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise XOR of two `u16x16` vectors.
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token) // acts on all 256 bits; lane width is irrelevant
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Bitwise NOT of every lane.
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        a ^ !0 // `!0` is the all-ones scalar; uses a vector-^-scalar operator impl defined elsewhere (presumably splats it - confirm)
+    }
+    #[inline(always)] // Shifts every lane left by the same `shift` amount.
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, shift: u32) -> u16x16<Avx512> {
+                _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) // the count travels in the low bits of a 128-bit register; `cast_signed` only re-types the `u32`
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)] // Shifts each lane of `a` left by the count held in the matching lane of `b`.
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_sllv_epi16(a.into(), b.into()).simd_into(token) // `v` = variable, per-lane counts
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Shifts every lane right by the same `shift` amount, filling with zeros.
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, shift: u32) -> u16x16<Avx512> {
+                _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token) // `srl` = logical shift (zeros shifted in), as needed for unsigned lanes
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)] // Shifts each lane of `a` right by the count in the matching lane of `b`, filling with zeros.
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_srlv_epi16(a.into(), b.into()).simd_into(token) // `srlv` = logical shift with per-lane counts
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise `a == b`; the result is a compact mask with one bit per lane.
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpeq_epu16_mask(a.into(), b.into()), // bit i is set when lane i compares equal
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Lane-wise unsigned `a < b`; the result is a compact mask with one bit per lane.
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmplt_epu16_mask(a.into(), b.into()), // `epu16`: lanes compared as unsigned
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmple_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpge_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
+                mask16x16 {
+                    val: _mm256_cmpgt_epu16_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_permutex2var_epi16(
+                    a.into(),
+                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x16<Avx512>,
+                b: u16x16<Avx512>,
+            ) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: u16x16<Avx512>,
+                b: u16x16<Avx512>,
+            ) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm256_permutex2var_epi16(
+                        a,
+                        _mm256_setr_epi16(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask16x16<Avx512>,
+                b: u16x16<Avx512>,
+                c: u16x16<Avx512>,
+            ) -> u16x16<Avx512> {
+                _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_min_epu16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+                _mm256_max_epu16(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x32<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>) -> (u16x8<Avx512>, u16x8<Avx512>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>) -> u8x16<Avx512> {
+                _mm256_cvtepi16_epi8(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>) -> u8x32<Avx512> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x16<Avx512>) -> u32x8<Avx512> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+        mask16x16 {
+            val: (if val { 65535u64 } else { 0 }) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i16; 16usize]) -> mask16x16<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask16x16 {
+                    val: _mm256_movepi16_mask(lanes),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask16x16<Avx512>) -> [i16; 16usize] {
+                let lanes = _mm256_movm_epi16(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        mask16x16 {
+            val: (bits & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        u64::from((a).val) & 65535u64
+    }
+    #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let bit = 1u64 << index;
+        let bits = u64::from((a).val);
+        let bits = if value { bits | bit } else { bits & !bit };
+        *a = mask16x16 {
+            val: (bits) as _,
+            simd: self,
+        };
+    }
+    #[inline(always)]
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: ((!u64::from((a).val)) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
+        mask16x16 {
+            val: (((u64::from((a).val) & u64::from((b).val))
+                | ((!u64::from((a).val)) & u64::from((c).val)))
+                & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        mask16x16 {
+            val: (!u64::from(a.val ^ b.val) & 65535u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits != 0
+    }
+    #[inline(always)]
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits == 65535u64
+    }
+    #[inline(always)]
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits != 65535u64
+    }
+    #[inline(always)]
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits == 0
+    }
+    #[inline(always)]
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64;
+        mask16x32 {
+            val: bits as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+        let bits = u64::from(a.val);
+        (
+            mask16x8 {
+                val: (bits & 255u64) as _,
+                simd: self,
+            },
+            mask16x8 {
+                val: ((bits >> 8usize) & 255u64) as _,
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: i32) -> i32x8<Avx512> {
+                _mm256_set1_epi32(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: i32x8<Avx512>,
+                b: i32x8<Avx512>,
+                shift: usize,
+            ) -> i32x8<Avx512> {
+                if shift >= 8usize {
+                    return b;
+                }
+                let idx = _mm256_add_epi8(
+                    _mm256_setr_epi8(
+                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                    ),
+                    _mm256_set1_epi8((shift * 4usize) as i8),
+                );
+                let result = _mm256_permutex2var_epi8(
+                    token.cvt_to_bytes_i32x8(a).val.0,
+                    idx,
+                    token.cvt_to_bytes_i32x8(b).val.0,
+                );
+                token.cvt_from_bytes_i32x8(u8x32 {
+                    val: crate::support::Aligned256(result),
+                    simd: token,
+                })
+            }
+        );
+        kernel(self, a, b, SHIFT)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
+        self,
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
+        if SHIFT == 0 {
+            return a;
+        }
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let a = self.cvt_to_bytes_i32x8(a).val.0;
+        let b = self.cvt_to_bytes_i32x8(b).val.0;
+        let result = dyn_alignr_256(self, b, a, SHIFT * 4usize);
+        self.cvt_from_bytes_i32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_add_epi32(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, shift: u32) -> i32x8<Avx512> {
+                _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, shift: u32) -> i32x8<Avx512> {
+                _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_srav_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmple_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpeq_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4962,12 +8027,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmpge_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmplt_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4975,12 +8040,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmpgt_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmple_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -4988,16 +8053,39 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpge_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpgt_epi32_mask(a.into(), b.into()),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10,
-                        42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-                    ),
+                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5006,16 +8094,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25,
-                        57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
-                    ),
+                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5024,16 +8109,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
-                        40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-                    ),
+                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5042,16 +8124,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39,
-                        41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
-                    ),
+                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5060,125 +8139,97 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i8x32<Avx512>,
-                b: i8x32<Avx512>,
-            ) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                a: i32x8<Avx512>,
+                b: i32x8<Avx512>,
+            ) -> (i32x8<Avx512>, i32x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41,
-                            10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56,
-                            25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i8x32<Avx512>,
-                b: i8x32<Avx512>,
-            ) -> (i8x32<Avx512>, i8x32<Avx512>) {
+                a: i32x8<Avx512>,
+                b: i32x8<Avx512>,
+            ) -> (i32x8<Avx512>, i32x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36,
-                            38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37,
-                            39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask8x32<Avx512>,
-                b: i8x32<Avx512>,
-                c: i8x32<Avx512>,
-            ) -> i8x32<Avx512> {
-                _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+                a: mask32x8<Avx512>,
+                b: i32x8<Avx512>,
+                c: i32x8<Avx512>,
+            ) -> i32x8<Avx512> {
+                _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_min_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_min_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_max_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_max_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>, b: i8x32<Avx512>) -> i8x64<Avx512> {
+            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x16<Avx512> {
                 _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>) -> (i8x16<Avx512>, i8x16<Avx512>) {
+            fn kernel(token: Avx512, a: i32x8<Avx512>) -> (i32x4<Avx512>, i32x4<Avx512>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -5188,100 +8239,110 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>) -> i8x32<Avx512> {
-                _mm256_sub_epi8(_mm256_setzero_si256(), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x8<Avx512>) -> i32x8<Avx512> {
+                _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>) -> u8x32<Avx512> {
+            fn kernel(token: Avx512, a: i32x8<Avx512>) -> u8x32<Avx512> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x32<Avx512>) -> u32x8<Avx512> {
+            fn kernel(token: Avx512, a: i32x8<Avx512>) -> u32x8<Avx512> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: u8) -> u8x32<Avx512> {
-                _mm256_set1_epi8(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x8<Avx512>) -> f32x8<Avx512> {
+                _mm256_cvtepi32_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: u32) -> u32x8<Avx512> {
+                _mm256_set1_epi32(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [u8; 32usize]>(&a.val.0)
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [u8; 32usize]>(&a.val.0)
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [u8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u8x32<Avx512>,
-                b: u8x32<Avx512>,
+                a: u32x8<Avx512>,
+                b: u32x8<Avx512>,
                 shift: usize,
-            ) -> u8x32<Avx512> {
-                if shift >= 32usize {
+            ) -> u32x8<Avx512> {
+                if shift >= 8usize {
                     return b;
                 }
                 let idx = _mm256_add_epi8(
@@ -5289,14 +8350,14 @@ impl Simd for Avx512 {
                         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                         21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                     ),
-                    _mm256_set1_epi8((shift) as i8),
+                    _mm256_set1_epi8((shift * 4usize) as i8),
                 );
                 let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_u8x32(a).val.0,
+                    token.cvt_to_bytes_u32x8(a).val.0,
                     idx,
-                    token.cvt_to_bytes_u8x32(b).val.0,
+                    token.cvt_to_bytes_u32x8(b).val.0,
                 );
-                token.cvt_from_bytes_u8x32(u8x32 {
+                token.cvt_from_bytes_u32x8(u8x32 {
                     val: crate::support::Aligned256(result),
                     simd: token,
                 })
@@ -5305,183 +8366,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
         self,
-        a: u8x32<Self>,
-        b: u8x32<Self>,
-    ) -> u8x32<Self> {
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 16usize {
+        if SHIFT >= 4usize {
             return b;
         }
-        let a = self.cvt_to_bytes_u8x32(a).val.0;
-        let b = self.cvt_to_bytes_u8x32(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT);
-        self.cvt_from_bytes_u8x32(u8x32 {
+        let a = self.cvt_to_bytes_u32x8(a).val.0;
+        let b = self.cvt_to_bytes_u32x8(b).val.0;
+        let result = dyn_alignr_256(self, b, a, SHIFT * 4usize);
+        self.cvt_from_bytes_u32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_add_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_add_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_sub_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                let dst_even = _mm256_mullo_epi16(a.into(), b.into());
-                let dst_odd = _mm256_mullo_epi16(
-                    _mm256_srli_epi16::<8>(a.into()),
-                    _mm256_srli_epi16::<8>(b.into()),
-                );
-                _mm256_or_si256(
-                    _mm256_slli_epi16(dst_odd, 8),
-                    _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, shift: u32) -> u8x32<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
-                let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
-                let lo_shifted = _mm256_sll_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_sll_epi16(hi_16, shift_count);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, shift: u32) -> u32x8<Avx512> {
+                _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm256_setzero_si256();
-                let value_extend = zero;
-                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm256_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm256_and_si256(_mm256_sllv_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm256_and_si256(_mm256_sllv_epi16(hi_values, hi_counts), byte_mask);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, shift: u32) -> u8x32<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm256_unpacklo_epi8(val, _mm256_setzero_si256());
-                let hi_16 = _mm256_unpackhi_epi8(val, _mm256_setzero_si256());
-                let lo_shifted = _mm256_srl_epi16(lo_16, shift_count);
-                let hi_shifted = _mm256_srl_epi16(hi_16, shift_count);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, shift: u32) -> u32x8<Avx512> {
+                _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm256_setzero_si256();
-                let value_extend = zero;
-                let lo_values = _mm256_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm256_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm256_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm256_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm256_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm256_and_si256(_mm256_srlv_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm256_and_si256(_mm256_srlv_epi16(hi_values, hi_counts), byte_mask);
-                _mm256_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_srlv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmpeq_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpeq_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -5489,12 +8503,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmplt_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmplt_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -5502,12 +8516,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmple_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmple_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -5515,12 +8529,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmpge_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpge_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -5528,12 +8542,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> mask8x32<Avx512> {
-                mask8x32 {
-                    val: _mm256_cmpgt_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
+                mask32x8 {
+                    val: _mm256_cmpgt_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -5541,16 +8555,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10,
-                        42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-                    ),
+                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5559,16 +8570,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25,
-                        57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
-                    ),
+                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5577,16 +8585,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38,
-                        40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-                    ),
+                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5595,16 +8600,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_permutex2var_epi32(
                     a.into(),
-                    _mm256_setr_epi8(
-                        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39,
-                        41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
-                    ),
+                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -5613,125 +8615,97 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u8x32<Avx512>,
-                b: u8x32<Avx512>,
-            ) -> (u8x32<Avx512>, u8x32<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41,
-                            10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56,
-                            25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                a: u32x8<Avx512>,
+                b: u32x8<Avx512>,
+            ) -> (u32x8<Avx512>, u32x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u8x32<Avx512>,
-                b: u8x32<Avx512>,
-            ) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                a: u32x8<Avx512>,
+                b: u32x8<Avx512>,
+            ) -> (u32x8<Avx512>, u32x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36,
-                            38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi8(
-                        a,
-                        _mm256_setr_epi8(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37,
-                            39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask8x32<Avx512>,
-                b: u8x32<Avx512>,
-                c: u8x32<Avx512>,
-            ) -> u8x32<Avx512> {
-                _mm256_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+                a: mask32x8<Avx512>,
+                b: u32x8<Avx512>,
+                c: u32x8<Avx512>,
+            ) -> u32x8<Avx512> {
+                _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_min_epu8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_min_epu32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x32<Avx512> {
-                _mm256_max_epu8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
+                _mm256_max_epu32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>, b: u8x32<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x16<Avx512> {
                 _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>) -> (u8x16<Avx512>, u8x16<Avx512>) {
+            fn kernel(token: Avx512, a: u32x8<Avx512>) -> (u32x4<Avx512>, u32x4<Avx512>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -5741,40 +8715,41 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_cvtepu8_epi16(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>) -> u8x32<Avx512> {
+                __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x32<Avx512>) -> u32x8<Avx512> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: u32x8<Avx512>) -> f32x8<Avx512> {
+                _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_zextsi256_si512(a.into())))
+                    .simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
-        mask8x32 {
-            val: (if val { 4294967295u64 } else { 0 }) as _,
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+        mask32x8 {
+            val: (if val { 255u64 } else { 0 }) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: [i8; 32usize]) -> mask8x32<Avx512> {
+            fn kernel(token: Avx512, val: [i32; 8usize]) -> mask32x8<Avx512> {
                 let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask8x32 {
-                    val: _mm256_movepi8_mask(lanes),
+                mask32x8 {
+                    val: _mm256_movepi32_mask(lanes),
                     simd: token,
                 }
             }
@@ -5782,198 +8757,198 @@ impl Simd for Avx512 {
         kernel(self, val)
     }
     #[inline(always)]
-    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: mask8x32<Avx512>) -> [i8; 32usize] {
-                let lanes = _mm256_movm_epi8(a.val);
+            fn kernel(token: Avx512, a: mask32x8<Avx512>) -> [i32; 8usize] {
+                let lanes = _mm256_movm_epi32(a.val);
                 crate::transmute::checked_transmute_copy(&lanes)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
-        mask8x32 {
-            val: (bits & 4294967295u64) as _,
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        mask32x8 {
+            val: (bits & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
-        u64::from((a).val) & 4294967295u64
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        u64::from((a).val) & 255u64
     }
     #[inline(always)]
-    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 32usize,
+            index < 8usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            32usize
+            8usize
         );
         let bit = 1u64 << index;
         let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask8x32 {
+        *a = mask32x8 {
             val: (bits) as _,
             simd: self,
         };
     }
     #[inline(always)]
-    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        mask8x32 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _,
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        mask8x32 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _,
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        mask8x32 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _,
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
-        mask8x32 {
-            val: ((!u64::from((a).val)) & 4294967295u64) as _,
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: ((!u64::from((a).val)) & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn select_mask8x32(
+    fn select_mask32x8(
         self,
-        a: mask8x32<Self>,
-        b: mask8x32<Self>,
-        c: mask8x32<Self>,
-    ) -> mask8x32<Self> {
-        mask8x32 {
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
+        mask32x8 {
             val: (((u64::from((a).val) & u64::from((b).val))
                 | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 4294967295u64) as _,
+                & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        mask8x32 {
-            val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _,
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        mask32x8 {
+            val: (!u64::from(a.val ^ b.val) & 255u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
         bits != 0
     }
     #[inline(always)]
-    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
-        bits == 4294967295u64
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
+        bits == 255u64
     }
     #[inline(always)]
-    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
-        bits != 4294967295u64
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
+        bits != 255u64
     }
     #[inline(always)]
-    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let bits = u64::from((a).val) & 255u64;
         bits == 0
     }
     #[inline(always)]
-    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
-        let bits = (u64::from(a.val) | (u64::from(b.val) << 32usize)) & u64::MAX;
-        mask8x64 {
-            val: bits,
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        let bits = (u64::from(a.val) | (u64::from(b.val) << 8usize)) & 65535u64;
+        mask32x16 {
+            val: bits as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
         let bits = u64::from(a.val);
         (
-            mask8x16 {
-                val: (bits & 65535u64) as _,
+            mask32x4 {
+                val: (bits & 15u64) as _,
                 simd: self,
             },
-            mask8x16 {
-                val: ((bits >> 16usize) & 65535u64) as _,
+            mask32x4 {
+                val: ((bits >> 4usize) & 15u64) as _,
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: i16) -> i16x16<Avx512> {
-                _mm256_set1_epi16(val).simd_into(token)
+            fn kernel(token: Avx512, val: f64) -> f64x4<Avx512> {
+                _mm256_set1_pd(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i16; 16usize]>(&a.val.0)
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [i16; 16usize]>(&a.val.0)
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [i16; 16usize]>(&mut a.val.0)
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
-        i16x16 {
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i16x16<Avx512>,
-                b: i16x16<Avx512>,
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
                 shift: usize,
-            ) -> i16x16<Avx512> {
-                if shift >= 16usize {
+            ) -> f64x4<Avx512> {
+                if shift >= 4usize {
                     return b;
                 }
                 let idx = _mm256_add_epi8(
@@ -5981,14 +8956,14 @@ impl Simd for Avx512 {
                         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                         21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                     ),
-                    _mm256_set1_epi8((shift * 2usize) as i8),
+                    _mm256_set1_epi8((shift * 8usize) as i8),
                 );
                 let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_i16x16(a).val.0,
+                    token.cvt_to_bytes_f64x4(a).val.0,
                     idx,
-                    token.cvt_to_bytes_i16x16(b).val.0,
+                    token.cvt_to_bytes_f64x4(b).val.0,
                 );
-                token.cvt_from_bytes_i16x16(u8x32 {
+                token.cvt_from_bytes_f64x4(u8x32 {
                     val: crate::support::Aligned256(result),
                     simd: token,
                 })
@@ -5997,136 +8972,140 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x16<const SHIFT: usize>(
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
         self,
-        a: i16x16<Self>,
-        b: i16x16<Self>,
-    ) -> i16x16<Self> {
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 8usize {
+        if SHIFT >= 2usize {
             return b;
         }
-        let a = self.cvt_to_bytes_i16x16(a).val.0;
-        let b = self.cvt_to_bytes_i16x16(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT * 2usize);
-        self.cvt_from_bytes_i16x16(u8x32 {
+        let a = self.cvt_to_bytes_f64x4(a).val.0;
+        let b = self.cvt_to_bytes_f64x4(b).val.0;
+        let result = dyn_alignr_256(self, b, a, SHIFT * 8usize);
+        self.cvt_from_bytes_f64x4(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_add_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_sub_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_mullo_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_sqrt_pd(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_rcp14_pd(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_add_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_sub_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        a ^ !0
-    }
-    #[inline(always)]
-    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, shift: u32) -> i16x16<Avx512> {
-                _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_mul_pd(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_sllv_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_div_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, shift: u32) -> i16x16<Avx512> {
-                _mm256_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                let mask = _mm256_set1_pd(-0.0);
+                _mm256_or_pd(
+                    _mm256_and_pd(mask, b.into()),
+                    _mm256_andnot_pd(mask, a.into()),
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_srav_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmpeq_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6134,12 +9113,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmplt_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6147,12 +9126,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmple_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6160,12 +9139,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmpge_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6173,295 +9152,318 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmpgt_epi16_mask(a.into(), b.into()),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b.into(),
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+            ) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(token),
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(token),
                 )
-                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b.into(),
+            fn kernel(
+                token: Avx512,
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+            ) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(token),
+                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(token),
                 )
-                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_max_pd(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_min_pd(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i16x16<Avx512>,
-                b: i16x16<Avx512>,
-            ) -> (i16x16<Avx512>, i16x16<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(
-                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                )
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+                c: f64x4<Avx512>,
+            ) -> f64x4<Avx512> {
+                _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i16x16<Avx512>,
-                b: i16x16<Avx512>,
-            ) -> (i16x16<Avx512>, i16x16<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                )
+                a: f64x4<Avx512>,
+                b: f64x4<Avx512>,
+                c: f64x4<Avx512>,
+            ) -> f64x4<Avx512> {
+                _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask16x16<Avx512>,
-                b: i16x16<Avx512>,
-                c: i16x16<Avx512>,
-            ) -> i16x16<Avx512> {
-                _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_min_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_max_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        a - self.trunc_f64x4(a)
+    }
+    #[inline(always)]
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>, b: i16x16<Avx512>) -> i16x32<Avx512> {
-                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
+                _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>) -> (i16x8<Avx512>, i16x8<Avx512>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(
+                token: Avx512,
+                a: mask64x4<Avx512>,
+                b: f64x4<Avx512>,
+                c: f64x4<Avx512>,
+            ) -> f64x4<Avx512> {
+                _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>) -> i16x16<Avx512> {
-                _mm256_sub_epi16(_mm256_setzero_si256(), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x8<Avx512> {
+                _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>) -> u8x32<Avx512> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> (f64x2<Avx512>, f64x2<Avx512>) {
+                (
+                    _mm256_extractf128_pd::<0>(a.into()).simd_into(token),
+                    _mm256_extractf128_pd::<1>(a.into()).simd_into(token),
+                )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x16<Avx512>) -> u32x8<Avx512> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f32x8<Avx512> {
+                _mm256_castpd_ps(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+    fn splat_i64x4(self, val: i64) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: u16) -> u16x16<Avx512> {
-                _mm256_set1_epi16(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx512, val: i64) -> i64x4<Avx512> {
+                _mm256_set1_epi64x(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [u16; 16usize]>(&a.val.0)
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [u16; 16usize]>(&a.val.0)
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [u16; 16usize]>(&mut a.val.0)
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [i64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
-        u16x16 {
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self> {
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u16x16<Avx512>,
-                b: u16x16<Avx512>,
+                a: i64x4<Avx512>,
+                b: i64x4<Avx512>,
                 shift: usize,
-            ) -> u16x16<Avx512> {
-                if shift >= 16usize {
+            ) -> i64x4<Avx512> {
+                if shift >= 4usize {
                     return b;
                 }
                 let idx = _mm256_add_epi8(
@@ -6469,14 +9471,14 @@ impl Simd for Avx512 {
                         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                         21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                     ),
-                    _mm256_set1_epi8((shift * 2usize) as i8),
+                    _mm256_set1_epi8((shift * 8usize) as i8),
                 );
                 let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_u16x16(a).val.0,
+                    token.cvt_to_bytes_i64x4(a).val.0,
                     idx,
-                    token.cvt_to_bytes_u16x16(b).val.0,
+                    token.cvt_to_bytes_i64x4(b).val.0,
                 );
-                token.cvt_from_bytes_u16x16(u8x32 {
+                token.cvt_from_bytes_i64x4(u8x32 {
                     val: crate::support::Aligned256(result),
                     simd: token,
                 })
@@ -6485,136 +9487,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+    fn slide_within_blocks_i64x4<const SHIFT: usize>(
         self,
-        a: u16x16<Self>,
-        b: u16x16<Self>,
-    ) -> u16x16<Self> {
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 8usize {
+        if SHIFT >= 2usize {
             return b;
         }
-        let a = self.cvt_to_bytes_u16x16(a).val.0;
-        let b = self.cvt_to_bytes_u16x16(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT * 2usize);
-        self.cvt_from_bytes_u16x16(u8x32 {
+        let a = self.cvt_to_bytes_i64x4(a).val.0;
+        let b = self.cvt_to_bytes_i64x4(b).val.0;
+        let result = dyn_alignr_256(self, b, a, SHIFT * 8usize);
+        self.cvt_from_bytes_i64x4(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_add_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_sub_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_mullo_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_mullo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, shift: u32) -> u16x16<Avx512> {
-                _mm256_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, shift: u32) -> i64x4<Avx512> {
+                _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_sllv_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, shift: u32) -> u16x16<Avx512> {
-                _mm256_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, shift: u32) -> i64x4<Avx512> {
+                _mm256_sra_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_srlv_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_srav_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmpeq_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmpeq_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6622,12 +9624,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmplt_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmplt_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6635,12 +9637,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmple_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmple_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6648,12 +9650,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmpge_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmpge_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6661,12 +9663,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> mask16x16<Avx512> {
-                mask16x16 {
-                    val: _mm256_cmpgt_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmpgt_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -6674,179 +9676,141 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
-                    b.into(),
-                )
-                .simd_into(token)
+            #[inline(always)]
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_permutex2var_epi16(
-                    a.into(),
-                    _mm256_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u16x16<Avx512>,
-                b: u16x16<Avx512>,
-            ) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                a: i64x4<Avx512>,
+                b: i64x4<Avx512>,
+            ) -> (i64x4<Avx512>, i64x4<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(
-                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 4, 1, 5), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(2, 6, 3, 7), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u16x16<Avx512>,
-                b: u16x16<Avx512>,
-            ) -> (u16x16<Avx512>, u16x16<Avx512>) {
+                a: i64x4<Avx512>,
+                b: i64x4<Avx512>,
+            ) -> (i64x4<Avx512>, i64x4<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm256_permutex2var_epi16(
-                        a,
-                        _mm256_setr_epi16(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 2, 4, 6), b)
+                        .simd_into(token),
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(1, 3, 5, 7), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask16x16<Avx512>,
-                b: u16x16<Avx512>,
-                c: u16x16<Avx512>,
-            ) -> u16x16<Avx512> {
-                _mm256_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+                a: mask64x4<Avx512>,
+                b: i64x4<Avx512>,
+                c: i64x4<Avx512>,
+            ) -> i64x4<Avx512> {
+                _mm256_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_min_epu16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_min_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x16<Avx512> {
-                _mm256_max_epu16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_max_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>, b: u16x16<Avx512>) -> u16x32<Avx512> {
+            fn kernel(token: Avx512, a: i64x4<Avx512>, b: i64x4<Avx512>) -> i64x8<Avx512> {
                 _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>) -> (u16x8<Avx512>, u16x8<Avx512>) {
+            fn kernel(token: Avx512, a: i64x4<Avx512>) -> (i64x2<Avx512>, i64x2<Avx512>) {
                 (
                     _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
                     _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
@@ -6856,249 +9820,100 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>) -> u8x16<Avx512> {
-                _mm256_cvtepi16_epi8(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x4<Avx512>) -> i64x4<Avx512> {
+                _mm256_sub_epi64(_mm256_setzero_si256(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>) -> u8x32<Avx512> {
+            fn kernel(token: Avx512, a: i64x4<Avx512>) -> u8x32<Avx512> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x16<Avx512>) -> u32x8<Avx512> {
+            fn kernel(token: Avx512, a: i64x4<Avx512>) -> u32x8<Avx512> {
                 __m256i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
-        mask16x16 {
-            val: (if val { 65535u64 } else { 0 }) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, val: [i16; 16usize]) -> mask16x16<Avx512> {
-                let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask16x16 {
-                    val: _mm256_movepi16_mask(lanes),
-                    simd: token,
-                }
-            }
-        );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: mask16x16<Avx512>) -> [i16; 16usize] {
-                let lanes = _mm256_movm_epi16(a.val);
-                crate::transmute::checked_transmute_copy(&lanes)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
-        mask16x16 {
-            val: (bits & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
-        u64::from((a).val) & 65535u64
-    }
-    #[inline(always)]
-    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let bit = 1u64 << index;
-        let bits = u64::from((a).val);
-        let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask16x16 {
-            val: (bits) as _,
-            simd: self,
-        };
-    }
-    #[inline(always)]
-    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        mask16x16 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        mask16x16 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        mask16x16 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
-        mask16x16 {
-            val: ((!u64::from((a).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn select_mask16x16(
-        self,
-        a: mask16x16<Self>,
-        b: mask16x16<Self>,
-        c: mask16x16<Self>,
-    ) -> mask16x16<Self> {
-        mask16x16 {
-            val: (((u64::from((a).val) & u64::from((b).val))
-                | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        mask16x16 {
-            val: (!u64::from(a.val ^ b.val) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits != 0
-    }
-    #[inline(always)]
-    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits == 65535u64
-    }
-    #[inline(always)]
-    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits != 65535u64
-    }
-    #[inline(always)]
-    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits == 0
-    }
-    #[inline(always)]
-    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
-        let bits = (u64::from(a.val) | (u64::from(b.val) << 16usize)) & 4294967295u64;
-        mask16x32 {
-            val: bits as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
-        let bits = u64::from(a.val);
-        (
-            mask16x8 {
-                val: (bits & 255u64) as _,
-                simd: self,
-            },
-            mask16x8 {
-                val: ((bits >> 8usize) & 255u64) as _,
-                simd: self,
-            },
-        )
-    }
-    #[inline(always)]
-    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+    fn splat_u64x4(self, val: u64) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: i32) -> i32x8<Avx512> {
-                _mm256_set1_epi32(val).simd_into(token)
+            fn kernel(token: Avx512, val: u64) -> u64x4<Avx512> {
+                _mm256_set1_epi64x(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [i32; 8usize]>(&a.val.0)
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize] {
+        crate::transmute::checked_transmute_copy::<__m256i, [u64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [i32; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize] {
+        crate::transmute::checked_cast_ref::<__m256i, [u64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [i32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize] {
+        crate::transmute::checked_cast_mut::<__m256i, [u64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
-        i32x8 {
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
-                token: Avx512,
-                a: i32x8<Avx512>,
-                b: i32x8<Avx512>,
+                token: Avx512,
+                a: u64x4<Avx512>,
+                b: u64x4<Avx512>,
                 shift: usize,
-            ) -> i32x8<Avx512> {
-                if shift >= 8usize {
+            ) -> u64x4<Avx512> {
+                if shift >= 4usize {
                     return b;
                 }
                 let idx = _mm256_add_epi8(
@@ -7106,14 +9921,14 @@ impl Simd for Avx512 {
                         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                         21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                     ),
-                    _mm256_set1_epi8((shift * 4usize) as i8),
+                    _mm256_set1_epi8((shift * 8usize) as i8),
                 );
                 let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_i32x8(a).val.0,
+                    token.cvt_to_bytes_u64x4(a).val.0,
                     idx,
-                    token.cvt_to_bytes_i32x8(b).val.0,
+                    token.cvt_to_bytes_u64x4(b).val.0,
                 );
-                token.cvt_from_bytes_i32x8(u8x32 {
+                token.cvt_from_bytes_u64x4(u8x32 {
                     val: crate::support::Aligned256(result),
                     simd: token,
                 })
@@ -7122,136 +9937,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x4<const SHIFT: usize>(
         self,
-        a: i32x8<Self>,
-        b: i32x8<Self>,
-    ) -> i32x8<Self> {
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 4usize {
+        if SHIFT >= 2usize {
             return b;
         }
-        let a = self.cvt_to_bytes_i32x8(a).val.0;
-        let b = self.cvt_to_bytes_i32x8(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT * 4usize);
-        self.cvt_from_bytes_i32x8(u8x32 {
+        let a = self.cvt_to_bytes_u64x4(a).val.0;
+        let b = self.cvt_to_bytes_u64x4(b).val.0;
+        let result = dyn_alignr_256(self, b, a, SHIFT * 8usize);
+        self.cvt_from_bytes_u64x4(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_add_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_mullo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
                 _mm256_and_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
                 _mm256_or_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
                 _mm256_xor_si256(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, shift: u32) -> i32x8<Avx512> {
-                _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, shift: u32) -> u64x4<Avx512> {
+                _mm256_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, shift: u32) -> i32x8<Avx512> {
-                _mm256_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, shift: u32) -> u64x4<Avx512> {
+                _mm256_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_srav_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_srlv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmpeq_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmpeq_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7259,12 +10074,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmplt_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmplt_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7272,12 +10087,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmple_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmple_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7285,12 +10100,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmpge_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmpge_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7298,12 +10113,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmpgt_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> mask64x4<Avx512> {
+                mask64x4 {
+                    val: _mm256_cmpgt_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7311,80 +10126,64 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_permutex2var_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_permutex2var_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_permutex2var_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_permutex2var_epi32(
-                    a.into(),
-                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_permutex2var_epi64(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i32x8<Avx512>,
-                b: i32x8<Avx512>,
-            ) -> (i32x8<Avx512>, i32x8<Avx512>) {
+                a: u64x4<Avx512>,
+                b: u64x4<Avx512>,
+            ) -> (u64x4<Avx512>, u64x4<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 4, 1, 5), b)
                         .simd_into(token),
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(2, 6, 3, 7), b)
                         .simd_into(token),
                 )
             }
@@ -7392,20 +10191,20 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i32x8<Avx512>,
-                b: i32x8<Avx512>,
-            ) -> (i32x8<Avx512>, i32x8<Avx512>) {
+                a: u64x4<Avx512>,
+                b: u64x4<Avx512>,
+            ) -> (u64x4<Avx512>, u64x4<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(0, 2, 4, 6), b)
                         .simd_into(token),
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
+                    _mm256_permutex2var_epi64(a, _mm256_setr_epi64x(1, 3, 5, 7), b)
                         .simd_into(token),
                 )
             }
@@ -7413,184 +10212,315 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask32x8<Avx512>,
-                b: i32x8<Avx512>,
-                c: i32x8<Avx512>,
-            ) -> i32x8<Avx512> {
-                _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b, c)
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: mask64x4<Avx512>,
+                b: u64x4<Avx512>,
+                c: u64x4<Avx512>,
+            ) -> u64x4<Avx512> {
+                _mm256_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_min_epu64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x4<Avx512> {
+                _mm256_max_epu64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x4<Avx512>, b: u64x4<Avx512>) -> u64x8<Avx512> {
+                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x4<Avx512>) -> (u64x2<Avx512>, u64x2<Avx512>) {
+                (
+                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
+                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                )
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x4<Avx512>) -> u8x32<Avx512> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x4<Avx512>) -> u32x8<Avx512> {
+                __m256i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        mask64x4 {
+            val: (if val { 15u64 } else { 0 }) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: [i64; 4usize]) -> mask64x4<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask64x4 {
+                    val: _mm256_movepi64_mask(lanes),
+                    simd: token,
+                }
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: mask64x4<Avx512>) -> [i64; 4usize] {
+                let lanes = _mm256_movm_epi64(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        mask64x4 {
+            val: (bits & 15u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        u64::from((a).val) & 15u64
+    }
+    #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let bit = 1u64 << index;
+        let bits = u64::from((a).val);
+        let bits = if value { bits | bit } else { bits & !bit };
+        *a = mask64x4 {
+            val: (bits) as _,
+            simd: self,
+        };
+    }
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: ((!u64::from((a).val)) & 15u64) as _,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_min_epi32(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        mask64x4 {
+            val: (((u64::from((a).val) & u64::from((b).val))
+                | ((!u64::from((a).val)) & u64::from((c).val)))
+                & 15u64) as _,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_max_epi32(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: (!u64::from(a.val ^ b.val) & 15u64) as _,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>, b: i32x8<Avx512>) -> i32x16<Avx512> {
-                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let bits = u64::from((a).val) & 15u64;
+        bits != 0
     }
     #[inline(always)]
-    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>) -> (i32x4<Avx512>, i32x4<Avx512>) {
-                (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
-                )
-            }
-        );
-        kernel(self, a)
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let bits = u64::from((a).val) & 15u64;
+        bits == 15u64
     }
     #[inline(always)]
-    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>) -> i32x8<Avx512> {
-                _mm256_sub_epi32(_mm256_setzero_si256(), a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let bits = u64::from((a).val) & 15u64;
+        bits != 15u64
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>) -> u8x32<Avx512> {
-                __m256i::from(a).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let bits = u64::from((a).val) & 15u64;
+        bits == 0
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>) -> u32x8<Avx512> {
-                __m256i::from(a).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        let bits = (u64::from(a.val) | (u64::from(b.val) << 4usize)) & 255u64;
+        mask64x8 {
+            val: bits as _,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: i32x8<Avx512>) -> f32x8<Avx512> {
-                _mm256_cvtepi32_ps(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+        let bits = u64::from(a.val);
+        (
+            mask64x2 {
+                val: (bits & 3u64) as _,
+                simd: self,
+            },
+            mask64x2 {
+                val: ((bits >> 2usize) & 3u64) as _,
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: u32) -> u32x8<Avx512> {
-                _mm256_set1_epi32(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx512, val: f32) -> f32x16<Avx512> {
+                _mm512_set1_ps(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m256i, [u32; 8usize]>(&a.val.0)
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
-        crate::transmute::checked_cast_ref::<__m256i, [u32; 8usize]>(&a.val.0)
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
-        crate::transmute::checked_cast_mut::<__m256i, [u32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
-        u32x8 {
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        f32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u32x8<Avx512>,
-                b: u32x8<Avx512>,
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
                 shift: usize,
-            ) -> u32x8<Avx512> {
-                if shift >= 8usize {
+            ) -> f32x16<Avx512> {
+                if shift >= 16usize {
                     return b;
                 }
-                let idx = _mm256_add_epi8(
-                    _mm256_setr_epi8(
-                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
                     ),
-                    _mm256_set1_epi8((shift * 4usize) as i8),
+                    _mm512_set1_epi8((shift * 4usize) as i8),
                 );
-                let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_u32x8(a).val.0,
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_f32x16(a).val.0,
                     idx,
-                    token.cvt_to_bytes_u32x8(b).val.0,
+                    token.cvt_to_bytes_f32x16(b).val.0,
                 );
-                token.cvt_from_bytes_u32x8(u8x32 {
-                    val: crate::support::Aligned256(result),
+                token.cvt_from_bytes_f32x16(u8x64 {
+                    val: crate::support::Aligned512(result),
                     simd: token,
                 })
             }
@@ -7598,136 +10528,127 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
         self,
-        a: u32x8<Self>,
-        b: u32x8<Self>,
-    ) -> u32x8<Self> {
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
         if SHIFT == 0 {
             return a;
         }
         if SHIFT >= 4usize {
             return b;
         }
-        let a = self.cvt_to_bytes_u32x8(a).val.0;
-        let b = self.cvt_to_bytes_u32x8(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT * 4usize);
-        self.cvt_from_bytes_u32x8(u8x32 {
-            val: crate::support::Aligned256(result),
+        let a = self.cvt_to_bytes_f32x16(a).val.0;
+        let b = self.cvt_to_bytes_f32x16(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 4usize);
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_add_epi32(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_sub_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_mullo_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_and_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_sqrt_ps(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_or_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_rcp14_ps(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_xor_si256(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_add_ps(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
-        a ^ !0
-    }
-    #[inline(always)]
-    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, shift: u32) -> u32x8<Avx512> {
-                _mm256_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_sub_ps(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_sllv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_mul_ps(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, shift: u32) -> u32x8<Avx512> {
-                _mm256_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_div_ps(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_srlv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                let mask = _mm512_set1_ps(-0.0);
+                _mm512_or_ps(
+                    _mm512_and_ps(mask, b.into()),
+                    _mm512_andnot_ps(mask, a.into()),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmpeq_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7735,12 +10656,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmplt_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7748,12 +10669,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmple_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7761,12 +10682,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmpge_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7774,12 +10695,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> mask32x8<Avx512> {
-                mask32x8 {
-                    val: _mm256_cmpgt_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -7787,13 +10708,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_permutex2var_epi32(
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
                     a.into(),
-                    _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11),
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
                     b.into(),
                 )
                 .simd_into(token)
@@ -7802,13 +10723,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_permutex2var_epi32(
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
                     a.into(),
-                    _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15),
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
                     b.into(),
                 )
                 .simd_into(token)
@@ -7817,13 +10738,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_permutex2var_epi32(
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
                     a.into(),
-                    _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14),
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
                     b.into(),
                 )
                 .simd_into(token)
@@ -7832,13 +10753,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_permutex2var_epi32(
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_permutex2var_ps(
                     a.into(),
-                    _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15),
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
                     b.into(),
                 )
                 .simd_into(token)
@@ -7847,356 +10768,423 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u32x8<Avx512>,
-                b: u32x8<Avx512>,
-            ) -> (u32x8<Avx512>, u32x8<Avx512>) {
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+            ) -> (f32x16<Avx512>, f32x16<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 8, 1, 9, 2, 10, 3, 11), b)
-                        .simd_into(token),
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(4, 12, 5, 13, 6, 14, 7, 15), b)
-                        .simd_into(token),
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u32x8<Avx512>,
-                b: u32x8<Avx512>,
-            ) -> (u32x8<Avx512>, u32x8<Avx512>) {
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+            ) -> (f32x16<Avx512>, f32x16<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14), b)
-                        .simd_into(token),
-                    _mm256_permutex2var_epi32(a, _mm256_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15), b)
-                        .simd_into(token),
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_ps(
+                        a,
+                        _mm512_setr_epi32(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_max_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_min_ps(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+                c: f32x16<Avx512>,
+            ) -> f32x16<Avx512> {
+                _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f32x16<Avx512>,
+                b: f32x16<Avx512>,
+                c: f32x16<Avx512>,
+            ) -> f32x16<Avx512> {
+                _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask32x8<Avx512>,
-                b: u32x8<Avx512>,
-                c: u32x8<Avx512>,
-            ) -> u32x8<Avx512> {
-                _mm256_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_min_epu32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        a - self.trunc_f32x16(a)
+    }
+    #[inline(always)]
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x8<Avx512> {
-                _mm256_max_epu32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>, b: u32x8<Avx512>) -> u32x16<Avx512> {
-                _mm512_inserti64x4::<1>(_mm512_castsi256_si512(a.into()), b.into()).simd_into(token)
+            fn kernel(
+                token: Avx512,
+                a: mask32x16<Avx512>,
+                b: f32x16<Avx512>,
+                c: f32x16<Avx512>,
+            ) -> f32x16<Avx512> {
+                _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>) -> (u32x4<Avx512>, u32x4<Avx512>) {
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> (f32x8<Avx512>, f32x8<Avx512>) {
                 (
-                    _mm256_extracti128_si256::<0>(a.into()).simd_into(token),
-                    _mm256_extracti128_si256::<1>(a.into()).simd_into(token),
+                    _mm512_castps512_ps256(a.into()).simd_into(token),
+                    _mm512_extractf32x8_ps::<1>(a.into()).simd_into(token),
                 )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>) -> u8x32<Avx512> {
-                __m256i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f64x8<Avx512> {
+                _mm512_castps_pd(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x8<Avx512>) -> f32x8<Avx512> {
-                _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_zextsi256_si512(a.into())))
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_castps_si512(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
-        mask32x8 {
-            val: (if val { 255u64 } else { 0 }) as _,
-            simd: self,
-        }
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, src: &[f32; 16usize]) -> f32x16<Avx512> {
+                let lanes: __m512 =
+                    crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src);
+                _mm512_permutexvar_ps(
+                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                    lanes,
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
-    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: [i32; 8usize]) -> mask32x8<Avx512> {
-                let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask32x8 {
-                    val: _mm256_movepi32_mask(lanes),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: f32x16<Avx512>, dest: &mut [f32; 16usize]) -> () {
+                let lanes = _mm512_permutexvar_ps(
+                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
+                    a.into(),
+                );
+                crate::transmute::checked_transmute_store::<__m512, [f32; 16usize]>(lanes, dest);
             }
         );
-        kernel(self, val)
+        kernel(self, a, dest);
     }
     #[inline(always)]
-    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: mask32x8<Avx512>) -> [i32; 8usize] {
-                let lanes = _mm256_movm_epi32(a.val);
-                crate::transmute::checked_transmute_copy(&lanes)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u8x64<Avx512> {
+                _mm512_castps_si512(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
-        mask32x8 {
-            val: (bits & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
-        u64::from((a).val) & 255u64
-    }
-    #[inline(always)]
-    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 8usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8usize
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_castps_si512(a.into()).simd_into(token)
+            }
         );
-        let bit = 1u64 << index;
-        let bits = u64::from((a).val);
-        let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask32x8 {
-            val: (bits) as _,
-            simd: self,
-        };
-    }
-    #[inline(always)]
-    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        mask32x8 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        mask32x8 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        mask32x8 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
-        mask32x8 {
-            val: ((!u64::from((a).val)) & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn select_mask32x8(
-        self,
-        a: mask32x8<Self>,
-        b: mask32x8<Self>,
-        c: mask32x8<Self>,
-    ) -> mask32x8<Self> {
-        mask32x8 {
-            val: (((u64::from((a).val) & u64::from((b).val))
-                | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        mask32x8 {
-            val: (!u64::from(a.val ^ b.val) & 255u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let bits = u64::from((a).val) & 255u64;
-        bits != 0
-    }
-    #[inline(always)]
-    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let bits = u64::from((a).val) & 255u64;
-        bits == 255u64
-    }
-    #[inline(always)]
-    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let bits = u64::from((a).val) & 255u64;
-        bits != 255u64
+        kernel(self, a)
     }
     #[inline(always)]
-    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let bits = u64::from((a).val) & 255u64;
-        bits == 0
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_cvttps_epu32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
-        let bits = (u64::from(a.val) | (u64::from(b.val) << 8usize)) & 65535u64;
-        mask32x16 {
-            val: bits as _,
-            simd: self,
-        }
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
+                let a = _mm512_max_ps(a.into(), _mm512_setzero_ps());
+                let mut converted = _mm512_cvttps_epu32(a);
+                let exceeds_unsigned_range =
+                    _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a);
+                converted = _mm512_mask_blend_epi32(
+                    exceeds_unsigned_range,
+                    converted,
+                    _mm512_set1_epi32(u32::MAX.cast_signed()),
+                );
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
-    #[inline(always)]
-    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
-        let bits = u64::from(a.val);
-        (
-            mask32x4 {
-                val: (bits & 15u64) as _,
-                simd: self,
-            },
-            mask32x4 {
-                val: ((bits >> 4usize) & 15u64) as _,
-                simd: self,
-            },
-        )
+    #[inline(always)]
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_cvttps_epi32(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: f64) -> f64x4<Avx512> {
-                _mm256_set1_pd(val).simd_into(token)
+            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
+                let a = a.into();
+                let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0));
+                let mut converted =
+                    _mm512_mask_cvttps_epi32(_mm512_set1_epi32(i32::MAX), in_range, a);
+                let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a);
+                converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted);
+                converted.simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: i8) -> i8x64<Avx512> {
+                _mm512_set1_epi8(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
-        crate::transmute::checked_transmute_copy::<__m256d, [f64; 4usize]>(&a.val.0)
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
-        crate::transmute::checked_cast_ref::<__m256d, [f64; 4usize]>(&a.val.0)
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
-        crate::transmute::checked_cast_mut::<__m256d, [f64; 4usize]>(&mut a.val.0)
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
-        f64x4 {
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
+        i8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x4<Avx512>,
-                b: f64x4<Avx512>,
+                a: i8x64<Avx512>,
+                b: i8x64<Avx512>,
                 shift: usize,
-            ) -> f64x4<Avx512> {
-                if shift >= 4usize {
+            ) -> i8x64<Avx512> {
+                if shift >= 64usize {
                     return b;
                 }
-                let idx = _mm256_add_epi8(
-                    _mm256_setr_epi8(
-                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
-                        21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+                let idx = _mm512_add_epi8(
+                    _mm512_set_epi8(
+                        63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45,
+                        44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26,
+                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+                        5, 4, 3, 2, 1, 0,
                     ),
-                    _mm256_set1_epi8((shift * 8usize) as i8),
+                    _mm512_set1_epi8((shift) as i8),
                 );
-                let result = _mm256_permutex2var_epi8(
-                    token.cvt_to_bytes_f64x4(a).val.0,
+                let result = _mm512_permutex2var_epi8(
+                    token.cvt_to_bytes_i8x64(a).val.0,
                     idx,
-                    token.cvt_to_bytes_f64x4(b).val.0,
+                    token.cvt_to_bytes_i8x64(b).val.0,
                 );
-                token.cvt_from_bytes_f64x4(u8x32 {
-                    val: crate::support::Aligned256(result),
+                token.cvt_from_bytes_i8x64(u8x64 {
+                    val: crate::support::Aligned512(result),
                     simd: token,
                 })
             }
@@ -8204,140 +11192,195 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x4<const SHIFT: usize>(
+    fn slide_within_blocks_i8x64<const SHIFT: usize>(
         self,
-        a: f64x4<Self>,
-        b: f64x4<Self>,
-    ) -> f64x4<Self> {
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 2usize {
+        if SHIFT >= 16usize {
             return b;
         }
-        let a = self.cvt_to_bytes_f64x4(a).val.0;
-        let b = self.cvt_to_bytes_f64x4(b).val.0;
-        let result = dyn_alignr_256(self, b, a, SHIFT * 8usize);
-        self.cvt_from_bytes_f64x4(u8x32 {
-            val: crate::support::Aligned256(result),
+        let a = self.cvt_to_bytes_i8x64(a).val.0;
+        let b = self.cvt_to_bytes_i8x64(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT);
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_add_epi8(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_xor_pd(a.into(), _mm256_set1_pd(-0.0)).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_sub_epi8(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_sqrt_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                let dst_even = _mm512_mullo_epi16(a.into(), b.into());
+                let dst_odd = _mm512_mullo_epi16(
+                    _mm512_srli_epi16::<8>(a.into()),
+                    _mm512_srli_epi16::<8>(b.into()),
+                );
+                _mm512_or_si512(
+                    _mm512_slli_epi16(dst_odd, 8),
+                    _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)),
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_rcp14_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_add_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_sub_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_mul_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, shift: u32) -> i8x64<Avx512> {
+                let val = a.into();
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+                let lo_16 = _mm512_unpacklo_epi8(
+                    val,
+                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
+                );
+                let hi_16 = _mm512_unpackhi_epi8(
+                    val,
+                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
+                );
+                let lo_shifted = _mm512_sll_epi16(lo_16, shift_count);
+                let hi_shifted = _mm512_sll_epi16(hi_16, shift_count);
+                _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_div_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = zero;
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                let mask = _mm256_set1_pd(-0.0);
-                _mm256_or_pd(
-                    _mm256_and_pd(mask, b.into()),
-                    _mm256_andnot_pd(mask, a.into()),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, shift: u32) -> i8x64<Avx512> {
+                let val = a.into();
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
+                let lo_16 = _mm512_unpacklo_epi8(
+                    val,
+                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
+                );
+                let hi_16 = _mm512_unpackhi_epi8(
+                    val,
+                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
+                );
+                let lo_shifted = _mm512_sra_epi16(lo_16, shift_count);
+                let hi_shifted = _mm512_sra_epi16(hi_16, shift_count);
+                _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
-                mask64x4 {
-                    val: _mm256_cmp_pd_mask::<0i32>(a.into(), b.into()),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val));
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff);
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
-                mask64x4 {
-                    val: _mm256_cmp_pd_mask::<17i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpeq_epi8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -8345,12 +11388,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
-                mask64x4 {
-                    val: _mm256_cmp_pd_mask::<18i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmplt_epi8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -8358,12 +11401,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
-                mask64x4 {
-                    val: _mm256_cmp_pd_mask::<29i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmple_epi8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -8371,12 +11414,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> mask64x4<Avx512> {
-                mask64x4 {
-                    val: _mm256_cmp_pd_mask::<30i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpge_epi8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -8384,467 +11427,319 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 4, 1, 5), b.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpgt_epi8_mask(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(2, 6, 3, 7), b.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_permutex2var_epi8(
+                    a.into(),
+                    _mm512_set_epi8(
+                        95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
+                        22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
+                        76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3,
+                        66, 2, 65, 1, 64, 0,
+                    ),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(0, 2, 4, 6), b.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_permutex2var_epi8(
+                    a.into(),
+                    _mm512_set_epi8(
+                        127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
+                        119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
+                        111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
+                        103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                    ),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_permutex2var_pd(a.into(), _mm256_setr_epi64x(1, 3, 5, 7), b.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_permutex2var_epi8(
+                    a.into(),
+                    _mm512_set_epi8(
+                        126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
+                        96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
+                        58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
+                        20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                    ),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f64x4<Avx512>,
-                b: f64x4<Avx512>,
-            ) -> (f64x4<Avx512>, f64x4<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 4, 1, 5), b).simd_into(token),
-                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(2, 6, 3, 7), b).simd_into(token),
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_permutex2var_epi8(
+                    a.into(),
+                    _mm512_set_epi8(
+                        127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
+                        97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
+                        59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
+                        21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                    ),
+                    b.into(),
                 )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x4<Avx512>,
-                b: f64x4<Avx512>,
-            ) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                a: i8x64<Avx512>,
+                b: i8x64<Avx512>,
+            ) -> (i8x64<Avx512>, i8x64<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(0, 2, 4, 6), b).simd_into(token),
-                    _mm256_permutex2var_pd(a, _mm256_setr_epi64x(1, 3, 5, 7), b).simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23,
+                            86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14,
+                            77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68,
+                            4, 67, 3, 66, 2, 65, 1, 64, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
+                            119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
+                            111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
+                            103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_max_pd(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_min_pd(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f64x4<Avx512>,
-                b: f64x4<Avx512>,
-                c: f64x4<Avx512>,
-            ) -> f64x4<Avx512> {
-                _mm256_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b, c)
-    }
-    #[inline(always)]
-    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x4<Avx512>,
-                b: f64x4<Avx512>,
-                c: f64x4<Avx512>,
-            ) -> f64x4<Avx512> {
-                _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b, c)
-    }
-    #[inline(always)]
-    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+                a: i8x64<Avx512>,
+                b: i8x64<Avx512>,
+            ) -> (i8x64<Avx512>, i8x64<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100,
+                            98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
+                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                    _mm512_permutex2var_epi8(
+                        a,
+                        _mm512_set_epi8(
+                            127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101,
+                            99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
+                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        ),
+                        b,
+                    )
+                    .simd_into(token),
+                )
             }
         );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        a - self.trunc_f64x4(a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f64x4<Avx512> {
-                _mm256_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(
+                token: Avx512,
+                a: mask8x64<Avx512>,
+                b: i8x64<Avx512>,
+                c: i8x64<Avx512>,
+            ) -> i8x64<Avx512> {
+                _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask64x4<Avx512>,
-                b: f64x4<Avx512>,
-                c: f64x4<Avx512>,
-            ) -> f64x4<Avx512> {
-                _mm256_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_min_epi8(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>, b: f64x4<Avx512>) -> f64x8<Avx512> {
-                _mm512_insertf64x4::<1>(_mm512_castpd256_pd512(a.into()), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_max_epi8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> (f64x2<Avx512>, f64x2<Avx512>) {
+            fn kernel(token: Avx512, a: i8x64<Avx512>) -> (i8x32<Avx512>, i8x32<Avx512>) {
                 (
-                    _mm256_extractf128_pd::<0>(a.into()).simd_into(token),
-                    _mm256_extractf128_pd::<1>(a.into()).simd_into(token),
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
                 )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x4<Avx512>) -> f32x8<Avx512> {
-                _mm256_castpd_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i8x64<Avx512>) -> i8x64<Avx512> {
+                _mm512_sub_epi8(_mm512_setzero_si512(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
-        mask64x4 {
-            val: (if val { 15u64 } else { 0 }) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> { // Re-types the i8x64 register as u8x64.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: [i64; 4usize]) -> mask64x4<Avx512> {
-                let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask64x4 {
-                    val: _mm256_movepi64_mask(lanes),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: i8x64<Avx512>) -> u8x64<Avx512> {
+                __m512i::from(a).simd_into(token) // unwrap to the raw __m512i, re-wrap as the target type; no lane arithmetic here
             }
         );
-        kernel(self, val)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> { // Re-types the i8x64 register as u32x16.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: mask64x4<Avx512>) -> [i64; 4usize] {
-                let lanes = _mm256_movm_epi64(a.val);
-                crate::transmute::checked_transmute_copy(&lanes)
+            fn kernel(token: Avx512, a: i8x64<Avx512>) -> u32x16<Avx512> {
+                __m512i::from(a).simd_into(token) // unwrap to the raw __m512i, re-wrap as the target type; no lane arithmetic here
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
-        mask64x4 {
-            val: (bits & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
-        u64::from((a).val) & 15u64
-    }
-    #[inline(always)]
-    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 4usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4usize
-        );
-        let bit = 1u64 << index;
-        let bits = u64::from((a).val);
-        let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask64x4 {
-            val: (bits) as _,
-            simd: self,
-        };
-    }
-    #[inline(always)]
-    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: ((!u64::from((a).val)) & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn select_mask64x4(
-        self,
-        a: mask64x4<Self>,
-        b: mask64x4<Self>,
-        c: mask64x4<Self>,
-    ) -> mask64x4<Self> {
-        mask64x4 {
-            val: (((u64::from((a).val) & u64::from((b).val))
-                | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: (!u64::from(a.val ^ b.val) & 15u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let bits = u64::from((a).val) & 15u64;
-        bits != 0
-    }
-    #[inline(always)]
-    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let bits = u64::from((a).val) & 15u64;
-        bits == 15u64
-    }
-    #[inline(always)]
-    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let bits = u64::from((a).val) & 15u64;
-        bits != 15u64
-    }
-    #[inline(always)]
-    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let bits = u64::from((a).val) & 15u64;
-        bits == 0
-    }
-    #[inline(always)]
-    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
-        let bits = (u64::from(a.val) | (u64::from(b.val) << 4usize)) & 255u64;
-        mask64x8 {
-            val: bits as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
-        let bits = u64::from(a.val);
-        (
-            mask64x2 {
-                val: (bits & 3u64) as _,
-                simd: self,
-            },
-            mask64x2 {
-                val: ((bits >> 2usize) & 3u64) as _,
-                simd: self,
-            },
-        )
-    }
-    #[inline(always)]
-    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> { // Broadcasts `val` into all 64 byte lanes.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: f32) -> f32x16<Avx512> {
-                _mm512_set1_ps(val).simd_into(token)
+            fn kernel(token: Avx512, val: u8) -> u8x64<Avx512> {
+                _mm512_set1_epi8(val.cast_signed()).simd_into(token) // intrinsic takes i8; cast_signed keeps the bit pattern
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> { // Builds a vector from an owned 64-byte array.
+        u8x64 { // `val` is filled via checked_transmute_copy — presumably a size-checked bitwise copy (TODO confirm)
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> { // Builds a vector from a borrowed 64-byte array.
+        u8x64 { // `val` is filled via checked_transmute_copy — presumably a size-checked bitwise copy (TODO confirm)
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m512, [f32; 16usize]>(&a.val.0)
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] { // Returns the lanes as an owned [u8; 64].
+        crate::transmute::checked_transmute_copy::<__m512i, [u8; 64usize]>(&a.val.0) // copies out of the inner __m512i (`a.val.0`)
     }
     #[inline(always)]
-    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
-        crate::transmute::checked_cast_ref::<__m512, [f32; 16usize]>(&a.val.0)
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] { // Borrows the lanes as &[u8; 64].
+        crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(&a.val.0) // reference cast of the inner __m512i; no copy is visible here
     }
     #[inline(always)]
-    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
-        crate::transmute::checked_cast_mut::<__m512, [f32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] { // Mutably borrows the lanes as &mut [u8; 64].
+        crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(&mut a.val.0) // mutable reference cast of the inner __m512i
     }
     #[inline(always)]
-    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () { // Writes the 64 lanes of `a` into `dest`.
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
-        f32x16 {
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> { // Byte-vector -> u8x64: source and target types coincide.
+        u8x64 { // rebuilt from a copy of `a.val`, tagged with this token
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> { // u8x64 -> byte-vector: source and target types coincide.
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f32x16<Avx512>,
-                b: f32x16<Avx512>,
+                a: u8x64<Avx512>,
+                b: u8x64<Avx512>,
                 shift: usize,
-            ) -> f32x16<Avx512> {
-                if shift >= 16usize {
+            ) -> u8x64<Avx512> {
+                if shift >= 64usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -8854,14 +11749,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift * 4usize) as i8),
+                    _mm512_set1_epi8((shift) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_f32x16(a).val.0,
+                    token.cvt_to_bytes_u8x64(a).val.0,
                     idx,
-                    token.cvt_to_bytes_f32x16(b).val.0,
+                    token.cvt_to_bytes_u8x64(b).val.0,
                 );
-                token.cvt_from_bytes_f32x16(u8x64 {
+                token.cvt_from_bytes_u8x64(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -8870,127 +11765,183 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+    fn slide_within_blocks_u8x64<const SHIFT: usize>( // Slide by SHIFT lanes; SHIFT == 0 yields `a`, SHIFT >= 16 yields `b`.
         self,
-        a: f32x16<Self>,
-        b: f32x16<Self>,
-    ) -> f32x16<Self> {
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 4usize {
+        if SHIFT >= 16usize { // 16 u8 lanes = 16 bytes; presumably one 128-bit block (TODO confirm)
             return b;
         }
-        let a = self.cvt_to_bytes_f32x16(a).val.0;
-        let b = self.cvt_to_bytes_f32x16(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT * 4usize);
-        self.cvt_from_bytes_f32x16(u8x64 {
+        let a = self.cvt_to_bytes_u8x64(a).val.0; // raw __m512i of `a`
+        let b = self.cvt_to_bytes_u8x64(b).val.0; // raw __m512i of `b`
+        let result = dyn_alignr_512(self, b, a, SHIFT); // u8 lanes are 1 byte wide, so SHIFT is passed unscaled; note the (b, a) operand order
+        self.cvt_from_bytes_u8x64(u8x64 { // re-wrap the shuffled register
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Lane-wise u8 addition.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_andnot_ps(_mm512_set1_ps(-0.0), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_add_epi8(a.into(), b.into()).simd_into(token) // non-saturating: each lane wraps modulo 256
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Lane-wise u8 subtraction (a - b).
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_xor_ps(a.into(), _mm512_set1_ps(-0.0)).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_sub_epi8(a.into(), b.into()).simd_into(token) // non-saturating: each lane wraps modulo 256
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Lane-wise u8 multiply keeping the low 8 bits, built from 16-bit multiplies.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_sqrt_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                let dst_even = _mm512_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the even bytes
+                let dst_odd = _mm512_mullo_epi16( // products of the odd bytes, computed in the low byte position
+                    _mm512_srli_epi16::<8>(a.into()), // bring a's odd byte down to bits 0..7
+                    _mm512_srli_epi16::<8>(b.into()), // bring b's odd byte down to bits 0..7
+                );
+                _mm512_or_si512( // merge: odd results in the high byte, even results in the low byte
+                    _mm512_slli_epi16(dst_odd, 8), // move the odd product back up; bits above 7 fall off
+                    _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)), // drop the junk high byte of the even product
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Bitwise AND of the two vectors.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_rcp14_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_and_si512(a.into(), b.into()).simd_into(token) // full 512-bit AND; lane width is irrelevant
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Bitwise OR of the two vectors.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_add_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_or_si512(a.into(), b.into()).simd_into(token) // full 512-bit OR; lane width is irrelevant
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Bitwise XOR of the two vectors.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_sub_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_xor_si512(a.into(), b.into()).simd_into(token) // full 512-bit XOR; lane width is irrelevant
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> { // Bitwise NOT, expressed as XOR with an all-ones value.
+        a ^ !0 // NOTE(review): relies on a vector-by-scalar `^` impl; `!0` is presumably inferred as u8 (0xFF) — confirm
+    }
+    #[inline(always)]
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> { // Shifts every u8 lane left by `shift`; bits pushed past bit 7 are dropped.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_mul_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, shift: u32) -> u8x64<Avx512> {
+                let byte_mask = _mm512_set1_epi16(0x00ff); // packus saturates, so keep only the low byte (as shlv_u8x64 does)
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); // scalar count for the 16-bit shift
+                let lo_16 = _mm512_unpacklo_epi8(a.into(), _mm512_setzero_si512()); // zero-extend low bytes of each 128-bit lane
+                let hi_16 = _mm512_unpackhi_epi8(a.into(), _mm512_setzero_si512()); // zero-extend high bytes of each 128-bit lane
+                let lo_shifted = _mm512_and_si512(_mm512_sll_epi16(lo_16, shift_count), byte_mask);
+                let hi_shifted = _mm512_and_si512(_mm512_sll_epi16(hi_16, shift_count), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) // inputs are <= 0xFF, so the pack is a plain narrowing
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Shifts each u8 lane of `a` left by the count in the matching lane of `b`.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_div_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = zero; // unsigned input: widen with zero bytes
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend); // low bytes of each 128-bit lane as 16-bit values
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend); // high bytes of each 128-bit lane as 16-bit values
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero); // counts widened the same way so lanes line up
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff); // keeps bits 0..7 so the saturating pack below cannot clamp
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) // narrow back to bytes, restoring the original lane order
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> { // Logical right shift of every u8 lane by `shift`.
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                let mask = _mm512_set1_ps(-0.0);
-                _mm512_or_ps(
-                    _mm512_and_ps(mask, b.into()),
-                    _mm512_andnot_ps(mask, a.into()),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, shift: u32) -> u8x64<Avx512> {
+                let val = a.into();
+                let shift_count = _mm_cvtsi32_si128(shift.cast_signed()); // scalar count for the 16-bit shift
+                let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512()); // zero-extend low bytes of each 128-bit lane
+                let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512()); // zero-extend high bytes of each 128-bit lane
+                let lo_shifted = _mm512_srl_epi16(lo_16, shift_count); // zero-filling shift: result never exceeds 0xFF
+                let hi_shifted = _mm512_srl_epi16(hi_16, shift_count);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) // inputs fit in a byte, so saturation never triggers
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> { // Logically shifts each u8 lane of `a` right by the count in the matching lane of `b`.
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                let val = a.into();
+                let counts = b.into();
+                let zero = _mm512_setzero_si512();
+                let value_extend = zero; // unsigned input: widen with zero bytes
+                let lo_values = _mm512_unpacklo_epi8(val, value_extend); // low bytes of each 128-bit lane as 16-bit values
+                let hi_values = _mm512_unpackhi_epi8(val, value_extend); // high bytes of each 128-bit lane as 16-bit values
+                let lo_counts = _mm512_unpacklo_epi8(counts, zero); // counts widened the same way so lanes line up
+                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
+                let byte_mask = _mm512_set1_epi16(0x00ff); // zero-extended values shifted right already fit in a byte; the mask keeps the pack exact
+                let lo_shifted =
+                    _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask);
+                let hi_shifted =
+                    _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask);
+                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token) // narrow back to bytes, restoring the original lane order
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmp_ps_mask::<0i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpeq_epu8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -8998,12 +11949,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmp_ps_mask::<17i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmplt_epu8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9011,12 +11962,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmp_ps_mask::<18i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmple_epu8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9024,12 +11975,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmp_ps_mask::<29i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpge_epu8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9037,12 +11988,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmp_ps_mask::<30i32>(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
+                mask8x64 {
+                    val: _mm512_cmpgt_epu8_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9050,13 +12001,18 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_permutex2var_ps(
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_permutex2var_epi8(
                     a.into(),
-                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    _mm512_set_epi8(
+                        95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
+                        22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
+                        76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3,
+                        66, 2, 65, 1, 64, 0,
+                    ),
                     b.into(),
                 )
                 .simd_into(token)
@@ -9065,13 +12021,18 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_permutex2var_ps(
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_permutex2var_epi8(
                     a.into(),
-                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    _mm512_set_epi8(
+                        127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
+                        119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
+                        111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
+                        103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                    ),
                     b.into(),
                 )
                 .simd_into(token)
@@ -9080,13 +12041,18 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_permutex2var_ps(
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_permutex2var_epi8(
                     a.into(),
-                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    _mm512_set_epi8(
+                        126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
+                        96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
+                        58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
+                        20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                    ),
                     b.into(),
                 )
                 .simd_into(token)
@@ -9095,13 +12061,18 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_permutex2var_ps(
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_permutex2var_epi8(
                     a.into(),
-                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    _mm512_set_epi8(
+                        127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
+                        97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
+                        59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
+                        21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                    ),
                     b.into(),
                 )
                 .simd_into(token)
@@ -9110,27 +12081,35 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f32x16<Avx512>,
-                b: f32x16<Avx512>,
-            ) -> (f32x16<Avx512>, f32x16<Avx512>) {
+                a: u8x64<Avx512>,
+                b: u8x64<Avx512>,
+            ) -> (u8x64<Avx512>, u8x64<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_ps(
+                    _mm512_permutex2var_epi8(
                         a,
-                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                        _mm512_set_epi8(
+                            95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23,
+                            86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14,
+                            77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68,
+                            4, 67, 3, 66, 2, 65, 1, 64, 0,
+                        ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_ps(
+                    _mm512_permutex2var_epi8(
                         a,
-                        _mm512_setr_epi32(
-                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
+                        _mm512_set_epi8(
+                            127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
+                            119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
+                            111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
+                            103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
                         ),
                         b,
                     )
@@ -9141,29 +12120,35 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f32x16<Avx512>,
-                b: f32x16<Avx512>,
-            ) -> (f32x16<Avx512>, f32x16<Avx512>) {
+                a: u8x64<Avx512>,
+                b: u8x64<Avx512>,
+            ) -> (u8x64<Avx512>, u8x64<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_ps(
+                    _mm512_permutex2var_epi8(
                         a,
-                        _mm512_setr_epi32(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+                        _mm512_set_epi8(
+                            126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100,
+                            98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
+                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_ps(
+                    _mm512_permutex2var_epi8(
                         a,
-                        _mm512_setr_epi32(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
+                        _mm512_set_epi8(
+                            127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101,
+                            99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
+                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                         ),
                         b,
                     )
@@ -9174,341 +12159,309 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_max_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(
+                token: Avx512,
+                a: mask8x64<Avx512>,
+                b: u8x64<Avx512>,
+                c: u8x64<Avx512>,
+            ) -> u8x64<Avx512> {
+                _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_min_ps(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_min_epu8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_range_ps::<5i32>(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+                _mm512_max_epu8(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, b: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_range_ps::<4i32>(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>) -> (u8x32<Avx512>, u8x32<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token),
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
+                )
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f32x16<Avx512>,
-                b: f32x16<Avx512>,
-                c: f32x16<Avx512>,
-            ) -> f32x16<Avx512> {
-                _mm512_fmadd_ps(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(token: Avx512, src: &[u8; 64usize]) -> u8x64<Avx512> {
+                let lanes: __m512i =
+                    crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src);
+                _mm512_permutexvar_epi8(
+                    _mm512_set_epi8(
+                        63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54,
+                        50, 46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41,
+                        37, 33, 29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28,
+                        24, 20, 16, 12, 8, 4, 0,
+                    ),
+                    lanes,
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, src)
     }
     #[inline(always)]
-    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f32x16<Avx512>,
-                b: f32x16<Avx512>,
-                c: f32x16<Avx512>,
-            ) -> f32x16<Avx512> {
-                _mm512_fmsub_ps(a.into(), b.into(), c.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>, dest: &mut [u8; 64usize]) -> () {
+                let lanes = _mm512_permutexvar_epi8(
+                    _mm512_set_epi8(
+                        63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27,
+                        11, 58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38,
+                        22, 6, 53, 37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33,
+                        17, 1, 48, 32, 16, 0,
+                    ),
+                    a.into(),
+                );
+                crate::transmute::checked_transmute_store::<__m512i, [u8; 64usize]>(lanes, dest);
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, dest);
     }
     #[inline(always)]
-    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: u8x64<Avx512>) -> u32x16<Avx512> {
+                __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_roundscale_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
+        mask8x64 {
+            val: if val { u64::MAX } else { 0 },
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_roundscale_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, val: [i8; 64usize]) -> mask8x64<Avx512> {
+                let lanes = crate::transmute::checked_transmute_copy(&val);
+                mask8x64 {
+                    val: _mm512_movepi8_mask(lanes),
+                    simd: token,
+                }
             }
         );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        a - self.trunc_f32x16(a)
+        kernel(self, val)
     }
     #[inline(always)]
-    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_roundscale_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: mask8x64<Avx512>) -> [i8; 64usize] {
+                let lanes = _mm512_movm_epi8(a.val);
+                crate::transmute::checked_transmute_copy(&lanes)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask32x16<Avx512>,
-                b: f32x16<Avx512>,
-                c: f32x16<Avx512>,
-            ) -> f32x16<Avx512> {
-                _mm512_mask_blend_ps(a.val, c.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b, c)
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        mask8x64 {
+            val: bits & u64::MAX,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> (f32x8<Avx512>, f32x8<Avx512>) {
-                (
-                    _mm512_castps512_ps256(a.into()).simd_into(token),
-                    _mm512_extractf32x8_ps::<1>(a.into()).simd_into(token),
-                )
-            }
-        );
-        kernel(self, a)
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        u64::from((a).val) & u64::MAX
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> f64x8<Avx512> {
-                _mm512_castps_pd(a.into()).simd_into(token)
-            }
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
         );
-        kernel(self, a)
+        let bit = 1u64 << index;
+        let bits = u64::from((a).val);
+        let bits = if value { bits | bit } else { bits & !bit };
+        *a = mask8x64 {
+            val: bits,
+            simd: self,
+        };
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_castps_si512(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, src: &[f32; 16usize]) -> f32x16<Avx512> {
-                let lanes: __m512 =
-                    crate::transmute::checked_transmute_copy::<[f32; 16usize], __m512>(src);
-                _mm512_permutexvar_ps(
-                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
-                    lanes,
-                )
-                .simd_into(token)
-            }
-        );
-        kernel(self, src)
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: (u64::from((a).val) | u64::from((b).val)) & u64::MAX,
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: (u64::from((a).val) ^ u64::from((b).val)) & u64::MAX,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>, dest: &mut [f32; 16usize]) -> () {
-                let lanes = _mm512_permutexvar_ps(
-                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
-                    a.into(),
-                );
-                crate::transmute::checked_transmute_store::<__m512, [f32; 16usize]>(lanes, dest);
-            }
-        );
-        kernel(self, a, dest);
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: (!u64::from((a).val)) & u64::MAX,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u8x64<Avx512> {
-                _mm512_castps_si512(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn select_mask8x64(
+        self,
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        mask8x64 {
+            val: ((u64::from((a).val) & u64::from((b).val))
+                | ((!u64::from((a).val)) & u64::from((c).val)))
+                & u64::MAX,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_castps_si512(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: !u64::from(a.val ^ b.val) & u64::MAX,
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_cvttps_epu32(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let bits = u64::from((a).val) & u64::MAX;
+        bits != 0
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> u32x16<Avx512> {
-                let a = _mm512_max_ps(a.into(), _mm512_setzero_ps());
-                let mut converted = _mm512_cvttps_epu32(a);
-                let exceeds_unsigned_range =
-                    _mm512_cmp_ps_mask::<17i32>(_mm512_set1_ps(4294967040.0), a);
-                converted = _mm512_mask_blend_epi32(
-                    exceeds_unsigned_range,
-                    converted,
-                    _mm512_set1_epi32(u32::MAX.cast_signed()),
-                );
-                converted.simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let bits = u64::from((a).val) & u64::MAX;
+        bits == u64::MAX
     }
     #[inline(always)]
-    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_cvttps_epi32(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let bits = u64::from((a).val) & u64::MAX;
+        bits != u64::MAX
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f32x16<Avx512>) -> i32x16<Avx512> {
-                let a = a.into();
-                let in_range = _mm512_cmp_ps_mask::<17i32>(a, _mm512_set1_ps(2147483648.0));
-                let mut converted =
-                    _mm512_mask_cvttps_epi32(_mm512_set1_epi32(i32::MAX), in_range, a);
-                let is_not_nan = _mm512_cmp_ps_mask::<7i32>(a, a);
-                converted = _mm512_mask_blend_epi32(is_not_nan, _mm512_setzero_si512(), converted);
-                converted.simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let bits = u64::from((a).val) & u64::MAX;
+        bits == 0
     }
     #[inline(always)]
-    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+        let bits = u64::from(a.val);
+        (
+            mask8x32 {
+                val: (bits & 4294967295u64) as _,
+                simd: self,
+            },
+            mask8x32 {
+                val: ((bits >> 32usize) & 4294967295u64) as _,
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: i8) -> i8x64<Avx512> {
-                _mm512_set1_epi8(val).simd_into(token)
+            fn kernel(token: Avx512, val: i16) -> i16x32<Avx512> {
+                _mm512_set1_epi16(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<__m512i, [i8; 64usize]>(&a.val.0)
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
-        crate::transmute::checked_cast_ref::<__m512i, [i8; 64usize]>(&a.val.0)
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
-        crate::transmute::checked_cast_mut::<__m512i, [i8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
-        i8x64 {
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i8x64<Avx512>,
-                b: i8x64<Avx512>,
+                a: i16x32<Avx512>,
+                b: i16x32<Avx512>,
                 shift: usize,
-            ) -> i8x64<Avx512> {
-                if shift >= 64usize {
+            ) -> i16x32<Avx512> {
+                if shift >= 32usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -9518,14 +12471,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift) as i8),
+                    _mm512_set1_epi8((shift * 2usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_i8x64(a).val.0,
+                    token.cvt_to_bytes_i16x32(a).val.0,
                     idx,
-                    token.cvt_to_bytes_i8x64(b).val.0,
+                    token.cvt_to_bytes_i16x32(b).val.0,
                 );
-                token.cvt_from_bytes_i8x64(u8x64 {
+                token.cvt_from_bytes_i16x32(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -9534,195 +12487,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+    fn slide_within_blocks_i16x32<const SHIFT: usize>(
         self,
-        a: i8x64<Self>,
-        b: i8x64<Self>,
-    ) -> i8x64<Self> {
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 16usize {
+        if SHIFT >= 8usize {
             return b;
         }
-        let a = self.cvt_to_bytes_i8x64(a).val.0;
-        let b = self.cvt_to_bytes_i8x64(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT);
-        self.cvt_from_bytes_i8x64(u8x64 {
+        let a = self.cvt_to_bytes_i16x32(a).val.0;
+        let b = self.cvt_to_bytes_i16x32(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 2usize);
+        self.cvt_from_bytes_i16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_add_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_add_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_sub_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_sub_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                let dst_even = _mm512_mullo_epi16(a.into(), b.into());
-                let dst_odd = _mm512_mullo_epi16(
-                    _mm512_srli_epi16::<8>(a.into()),
-                    _mm512_srli_epi16::<8>(b.into()),
-                );
-                _mm512_or_si512(
-                    _mm512_slli_epi16(dst_odd, 8),
-                    _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_mullo_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
                 _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, shift: u32) -> i8x64<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm512_unpacklo_epi8(
-                    val,
-                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
-                );
-                let hi_16 = _mm512_unpackhi_epi8(
-                    val,
-                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
-                );
-                let lo_shifted = _mm512_sll_epi16(lo_16, shift_count);
-                let hi_shifted = _mm512_sll_epi16(hi_16, shift_count);
-                _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, shift: u32) -> i16x32<Avx512> {
+                _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm512_setzero_si512();
-                let value_extend = zero;
-                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm512_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
-                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_sllv_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, shift: u32) -> i8x64<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm512_unpacklo_epi8(
-                    val,
-                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
-                );
-                let hi_16 = _mm512_unpackhi_epi8(
-                    val,
-                    _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), val)),
-                );
-                let lo_shifted = _mm512_sra_epi16(lo_16, shift_count);
-                let hi_shifted = _mm512_sra_epi16(hi_16, shift_count);
-                _mm512_packs_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, shift: u32) -> i16x32<Avx512> {
+                _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm512_setzero_si512();
-                let value_extend = _mm512_movm_epi8(_mm512_cmpgt_epi8_mask(zero, val));
-                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm512_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm512_and_si512(_mm512_srav_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm512_and_si512(_mm512_srav_epi16(hi_values, hi_counts), byte_mask);
-                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_srav_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmpeq_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpeq_epi16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9730,12 +12624,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmplt_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmplt_epi16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9743,12 +12637,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmple_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmple_epi16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9756,12 +12650,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmpge_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpge_epi16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9769,12 +12663,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmpgt_epi8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpgt_epi16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -9782,17 +12676,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
-                        22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
-                        76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3,
-                        66, 2, 65, 1, 64, 0,
+                    _mm512_set_epi16(
+                        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
+                        37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
                     ),
                     b.into(),
                 )
@@ -9802,17 +12694,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
-                        119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
-                        111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
-                        103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                    _mm512_set_epi16(
+                        63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
+                        22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
                     ),
                     b.into(),
                 )
@@ -9822,17 +12712,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
-                        96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
-                        58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
-                        20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                    _mm512_set_epi16(
+                        62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+                        24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                     ),
                     b.into(),
                 )
@@ -9842,17 +12730,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
-                        97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
-                        59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
-                        21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                    _mm512_set_epi16(
+                        63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
+                        25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                     ),
                     b.into(),
                 )
@@ -9862,35 +12748,31 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i8x64<Avx512>,
-                b: i8x64<Avx512>,
-            ) -> (i8x64<Avx512>, i8x64<Avx512>) {
+                a: i16x32<Avx512>,
+                b: i16x32<Avx512>,
+            ) -> (i16x32<Avx512>, i16x32<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23,
-                            86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14,
-                            77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68,
-                            4, 67, 3, 66, 2, 65, 1, 64, 0,
+                        _mm512_set_epi16(
+                            47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7,
+                            38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
-                            119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
-                            111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
-                            103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                        _mm512_set_epi16(
+                            63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23,
+                            54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
                         ),
                         b,
                     )
@@ -9901,33 +12783,29 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i8x64<Avx512>,
-                b: i8x64<Avx512>,
-            ) -> (i8x64<Avx512>, i8x64<Avx512>) {
+                a: i16x32<Avx512>,
+                b: i16x32<Avx512>,
+            ) -> (i16x32<Avx512>, i16x32<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100,
-                            98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                        _mm512_set_epi16(
                             62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
                             26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101,
-                            99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                        _mm512_set_epi16(
                             63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
                             27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                         ),
@@ -9940,45 +12818,45 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask8x64<Avx512>,
-                b: i8x64<Avx512>,
-                c: i8x64<Avx512>,
-            ) -> i8x64<Avx512> {
-                _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+                a: mask16x32<Avx512>,
+                b: i16x32<Avx512>,
+                c: i16x32<Avx512>,
+            ) -> i16x32<Avx512> {
+                _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_min_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_min_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>, b: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_max_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_max_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>) -> (i8x32<Avx512>, i8x32<Avx512>) {
+            fn kernel(token: Avx512, a: i16x32<Avx512>) -> (i16x16<Avx512>, i16x16<Avx512>) {
                 (
                     _mm512_castsi512_si256(a.into()).simd_into(token),
                     _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
@@ -9988,100 +12866,100 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>) -> i8x64<Avx512> {
-                _mm512_sub_epi8(_mm512_setzero_si512(), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i16x32<Avx512>) -> i16x32<Avx512> {
+                _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: i16x32<Avx512>) -> u8x64<Avx512> {
                 __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i8x64<Avx512>) -> u32x16<Avx512> {
+            fn kernel(token: Avx512, a: i16x32<Avx512>) -> u32x16<Avx512> {
                 __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: u8) -> u8x64<Avx512> {
-                _mm512_set1_epi8(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx512, val: u16) -> u16x32<Avx512> {
+                _mm512_set1_epi16(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
-    #[inline(always)]
-    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    #[inline(always)]
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
-        crate::transmute::checked_transmute_copy::<__m512i, [u8; 64usize]>(&a.val.0)
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
-        crate::transmute::checked_cast_ref::<__m512i, [u8; 64usize]>(&a.val.0)
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
-        crate::transmute::checked_cast_mut::<__m512i, [u8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u8x64<Avx512>,
-                b: u8x64<Avx512>,
+                a: u16x32<Avx512>,
+                b: u16x32<Avx512>,
                 shift: usize,
-            ) -> u8x64<Avx512> {
-                if shift >= 64usize {
+            ) -> u16x32<Avx512> {
+                if shift >= 32usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -10091,14 +12969,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift) as i8),
+                    _mm512_set1_epi8((shift * 2usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_u8x64(a).val.0,
+                    token.cvt_to_bytes_u16x32(a).val.0,
                     idx,
-                    token.cvt_to_bytes_u8x64(b).val.0,
+                    token.cvt_to_bytes_u16x32(b).val.0,
                 );
-                token.cvt_from_bytes_u8x64(u8x64 {
+                token.cvt_from_bytes_u16x32(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -10107,183 +12985,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+    fn slide_within_blocks_u16x32<const SHIFT: usize>(
         self,
-        a: u8x64<Self>,
-        b: u8x64<Self>,
-    ) -> u8x64<Self> {
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 16usize {
+        if SHIFT >= 8usize {
             return b;
         }
-        let a = self.cvt_to_bytes_u8x64(a).val.0;
-        let b = self.cvt_to_bytes_u8x64(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT);
-        self.cvt_from_bytes_u8x64(u8x64 {
+        let a = self.cvt_to_bytes_u16x32(a).val.0;
+        let b = self.cvt_to_bytes_u16x32(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 2usize);
+        self.cvt_from_bytes_u16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_add_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_add_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_sub_epi8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_sub_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                let dst_even = _mm512_mullo_epi16(a.into(), b.into());
-                let dst_odd = _mm512_mullo_epi16(
-                    _mm512_srli_epi16::<8>(a.into()),
-                    _mm512_srli_epi16::<8>(b.into()),
-                );
-                _mm512_or_si512(
-                    _mm512_slli_epi16(dst_odd, 8),
-                    _mm512_and_si512(dst_even, _mm512_set1_epi16(0xFF)),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_mullo_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
                 _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, shift: u32) -> u8x64<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512());
-                let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512());
-                let lo_shifted = _mm512_sll_epi16(lo_16, shift_count);
-                let hi_shifted = _mm512_sll_epi16(hi_16, shift_count);
-                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, shift: u32) -> u16x32<Avx512> {
+                _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm512_setzero_si512();
-                let value_extend = zero;
-                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm512_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm512_and_si512(_mm512_sllv_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm512_and_si512(_mm512_sllv_epi16(hi_values, hi_counts), byte_mask);
-                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_sllv_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, shift: u32) -> u8x64<Avx512> {
-                let val = a.into();
-                let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
-                let lo_16 = _mm512_unpacklo_epi8(val, _mm512_setzero_si512());
-                let hi_16 = _mm512_unpackhi_epi8(val, _mm512_setzero_si512());
-                let lo_shifted = _mm512_srl_epi16(lo_16, shift_count);
-                let hi_shifted = _mm512_srl_epi16(hi_16, shift_count);
-                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, shift: u32) -> u16x32<Avx512> {
+                _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                let val = a.into();
-                let counts = b.into();
-                let zero = _mm512_setzero_si512();
-                let value_extend = zero;
-                let lo_values = _mm512_unpacklo_epi8(val, value_extend);
-                let hi_values = _mm512_unpackhi_epi8(val, value_extend);
-                let lo_counts = _mm512_unpacklo_epi8(counts, zero);
-                let hi_counts = _mm512_unpackhi_epi8(counts, zero);
-                let byte_mask = _mm512_set1_epi16(0x00ff);
-                let lo_shifted =
-                    _mm512_and_si512(_mm512_srlv_epi16(lo_values, lo_counts), byte_mask);
-                let hi_shifted =
-                    _mm512_and_si512(_mm512_srlv_epi16(hi_values, hi_counts), byte_mask);
-                _mm512_packus_epi16(lo_shifted, hi_shifted).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_srlv_epi16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmpeq_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpeq_epu16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10291,12 +13122,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmplt_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmplt_epu16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10304,12 +13135,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmple_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmple_epu16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10317,12 +13148,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmpge_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpge_epu16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10330,12 +13161,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> mask8x64<Avx512> {
-                mask8x64 {
-                    val: _mm512_cmpgt_epu8_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
+                mask16x32 {
+                    val: _mm512_cmpgt_epu16_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10343,17 +13174,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23, 86,
-                        22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14, 77, 13,
-                        76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68, 4, 67, 3,
-                        66, 2, 65, 1, 64, 0,
+                    _mm512_set_epi16(
+                        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
+                        37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
                     ),
                     b.into(),
                 )
@@ -10363,17 +13192,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
-                        119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
-                        111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
-                        103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                    _mm512_set_epi16(
+                        63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
+                        22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
                     ),
                     b.into(),
                 )
@@ -10383,17 +13210,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98,
-                        96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60,
-                        58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22,
-                        20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                    _mm512_set_epi16(
+                        62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+                        24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                     ),
                     b.into(),
                 )
@@ -10403,17 +13228,15 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_permutex2var_epi8(
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_permutex2var_epi16(
                     a.into(),
-                    _mm512_set_epi8(
-                        127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99,
-                        97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, 63, 61,
-                        59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27, 25, 23,
-                        21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                    _mm512_set_epi16(
+                        63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
+                        25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                     ),
                     b.into(),
                 )
@@ -10423,35 +13246,31 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u8x64<Avx512>,
-                b: u8x64<Avx512>,
-            ) -> (u8x64<Avx512>, u8x64<Avx512>) {
+                a: u16x32<Avx512>,
+                b: u16x32<Avx512>,
+            ) -> (u16x32<Avx512>, u16x32<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, 87, 23,
-                            86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, 79, 15, 78, 14,
-                            77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, 71, 7, 70, 6, 69, 5, 68,
-                            4, 67, 3, 66, 2, 65, 1, 64, 0,
+                        _mm512_set_epi16(
+                            47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7,
+                            38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56,
-                            119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48,
-                            111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40,
-                            103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32,
+                        _mm512_set_epi16(
+                            63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23,
+                            54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
                         ),
                         b,
                     )
@@ -10462,33 +13281,29 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u8x64<Avx512>,
-                b: u8x64<Avx512>,
-            ) -> (u8x64<Avx512>, u8x64<Avx512>) {
+                a: u16x32<Avx512>,
+                b: u16x32<Avx512>,
+            ) -> (u16x32<Avx512>, u16x32<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100,
-                            98, 96, 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                        _mm512_set_epi16(
                             62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
                             26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi8(
+                    _mm512_permutex2var_epi16(
                         a,
-                        _mm512_set_epi8(
-                            127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101,
-                            99, 97, 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                        _mm512_set_epi16(
                             63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
                             27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
                         ),
@@ -10501,45 +13316,45 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask8x64<Avx512>,
-                b: u8x64<Avx512>,
-                c: u8x64<Avx512>,
-            ) -> u8x64<Avx512> {
-                _mm512_mask_blend_epi8(a.val, c.into(), b.into()).simd_into(token)
+                a: mask16x32<Avx512>,
+                b: u16x32<Avx512>,
+                c: u16x32<Avx512>,
+            ) -> u16x32<Avx512> {
+                _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_min_epu8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_min_epu16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, b: u8x64<Avx512>) -> u8x64<Avx512> {
-                _mm512_max_epu8(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+                _mm512_max_epu16(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>) -> (u8x32<Avx512>, u8x32<Avx512>) {
+            fn kernel(token: Avx512, a: u16x32<Avx512>) -> (u16x16<Avx512>, u16x16<Avx512>) {
                 (
                     _mm512_castsi512_si256(a.into()).simd_into(token),
                     _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
@@ -10549,18 +13364,16 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, src: &[u8; 64usize]) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, src: &[u16; 32usize]) -> u16x32<Avx512> {
                 let lanes: __m512i =
-                    crate::transmute::checked_transmute_copy::<[u8; 64usize], __m512i>(src);
-                _mm512_permutexvar_epi8(
-                    _mm512_set_epi8(
-                        63, 59, 55, 51, 47, 43, 39, 35, 31, 27, 23, 19, 15, 11, 7, 3, 62, 58, 54,
-                        50, 46, 42, 38, 34, 30, 26, 22, 18, 14, 10, 6, 2, 61, 57, 53, 49, 45, 41,
-                        37, 33, 29, 25, 21, 17, 13, 9, 5, 1, 60, 56, 52, 48, 44, 40, 36, 32, 28,
-                        24, 20, 16, 12, 8, 4, 0,
+                    crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src);
+                _mm512_permutexvar_epi16(
+                    _mm512_set_epi16(
+                        31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17,
+                        13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0,
                     ),
                     lanes,
                 )
@@ -10570,49 +13383,67 @@ impl Simd for Avx512 {
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>, dest: &mut [u8; 64usize]) -> () {
-                let lanes = _mm512_permutexvar_epi8(
-                    _mm512_set_epi8(
-                        63, 47, 31, 15, 62, 46, 30, 14, 61, 45, 29, 13, 60, 44, 28, 12, 59, 43, 27,
-                        11, 58, 42, 26, 10, 57, 41, 25, 9, 56, 40, 24, 8, 55, 39, 23, 7, 54, 38,
-                        22, 6, 53, 37, 21, 5, 52, 36, 20, 4, 51, 35, 19, 3, 50, 34, 18, 2, 49, 33,
-                        17, 1, 48, 32, 16, 0,
+            fn kernel(token: Avx512, a: u16x32<Avx512>, dest: &mut [u16; 32usize]) -> () {
+                let lanes = _mm512_permutexvar_epi16(
+                    _mm512_set_epi16(
+                        31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3,
+                        26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0,
                     ),
                     a.into(),
                 );
-                crate::transmute::checked_transmute_store::<__m512i, [u8; 64usize]>(lanes, dest);
+                crate::transmute::checked_transmute_store::<__m512i, [u16; 32usize]>(lanes, dest);
             }
         );
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u8x64<Avx512>) -> u32x16<Avx512> {
+            fn kernel(token: Avx512, a: u16x32<Avx512>) -> u8x32<Avx512> {
+                _mm512_cvtepi16_epi8(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>) -> u8x64<Avx512> {
                 __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
-        mask8x64 {
-            val: if val { u64::MAX } else { 0 },
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u16x32<Avx512>) -> u32x16<Avx512> {
+                __m512i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        mask16x32 {
+            val: (if val { 4294967295u64 } else { 0 }) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: [i8; 64usize]) -> mask8x64<Avx512> {
+            fn kernel(token: Avx512, val: [i16; 32usize]) -> mask16x32<Avx512> {
                 let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask8x64 {
-                    val: _mm512_movepi8_mask(lanes),
+                mask16x32 {
+                    val: _mm512_movepi16_mask(lanes),
                     simd: token,
                 }
             }
@@ -10620,190 +13451,190 @@ impl Simd for Avx512 {
         kernel(self, val)
     }
     #[inline(always)]
-    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: mask8x64<Avx512>) -> [i8; 64usize] {
-                let lanes = _mm512_movm_epi8(a.val);
+            fn kernel(token: Avx512, a: mask16x32<Avx512>) -> [i16; 32usize] {
+                let lanes = _mm512_movm_epi16(a.val);
                 crate::transmute::checked_transmute_copy(&lanes)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
-        mask8x64 {
-            val: bits & u64::MAX,
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        mask16x32 {
+            val: (bits & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
-        u64::from((a).val) & u64::MAX
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        u64::from((a).val) & 4294967295u64
     }
     #[inline(always)]
-    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 64usize,
+            index < 32usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            64usize
+            32usize
         );
         let bit = 1u64 << index;
         let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask8x64 {
-            val: bits,
+        *a = mask16x32 {
+            val: (bits) as _,
             simd: self,
         };
     }
     #[inline(always)]
-    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: (u64::from((a).val) & u64::from((b).val)) & u64::MAX,
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: (u64::from((a).val) | u64::from((b).val)) & u64::MAX,
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: (u64::from((a).val) ^ u64::from((b).val)) & u64::MAX,
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: (!u64::from((a).val)) & u64::MAX,
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: ((!u64::from((a).val)) & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn select_mask8x64(
+    fn select_mask16x32(
         self,
-        a: mask8x64<Self>,
-        b: mask8x64<Self>,
-        c: mask8x64<Self>,
-    ) -> mask8x64<Self> {
-        mask8x64 {
-            val: ((u64::from((a).val) & u64::from((b).val))
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        mask16x32 {
+            val: (((u64::from((a).val) & u64::from((b).val))
                 | ((!u64::from((a).val)) & u64::from((c).val)))
-                & u64::MAX,
+                & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: !u64::from(a.val ^ b.val) & u64::MAX,
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let bits = u64::from((a).val) & u64::MAX;
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let bits = u64::from((a).val) & 4294967295u64;
         bits != 0
     }
     #[inline(always)]
-    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let bits = u64::from((a).val) & u64::MAX;
-        bits == u64::MAX
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let bits = u64::from((a).val) & 4294967295u64;
+        bits == 4294967295u64
     }
     #[inline(always)]
-    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let bits = u64::from((a).val) & u64::MAX;
-        bits != u64::MAX
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let bits = u64::from((a).val) & 4294967295u64;
+        bits != 4294967295u64
     }
     #[inline(always)]
-    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let bits = u64::from((a).val) & u64::MAX;
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let bits = u64::from((a).val) & 4294967295u64;
         bits == 0
     }
     #[inline(always)]
-    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
         let bits = u64::from(a.val);
         (
-            mask8x32 {
-                val: (bits & 4294967295u64) as _,
+            mask16x16 {
+                val: (bits & 65535u64) as _,
                 simd: self,
             },
-            mask8x32 {
-                val: ((bits >> 32usize) & 4294967295u64) as _,
+            mask16x16 {
+                val: ((bits >> 16usize) & 65535u64) as _,
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: i16) -> i16x32<Avx512> {
-                _mm512_set1_epi16(val).simd_into(token)
+            fn kernel(token: Avx512, val: i32) -> i32x16<Avx512> {
+                _mm512_set1_epi32(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m512i, [i16; 32usize]>(&a.val.0)
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
-        crate::transmute::checked_cast_ref::<__m512i, [i16; 32usize]>(&a.val.0)
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
-        crate::transmute::checked_cast_mut::<__m512i, [i16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i16x32<Avx512>,
-                b: i16x32<Avx512>,
+                a: i32x16<Avx512>,
+                b: i32x16<Avx512>,
                 shift: usize,
-            ) -> i16x32<Avx512> {
-                if shift >= 32usize {
+            ) -> i32x16<Avx512> {
+                if shift >= 16usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -10813,14 +13644,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift * 2usize) as i8),
+                    _mm512_set1_epi8((shift * 4usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_i16x32(a).val.0,
+                    token.cvt_to_bytes_i32x16(a).val.0,
                     idx,
-                    token.cvt_to_bytes_i16x32(b).val.0,
+                    token.cvt_to_bytes_i32x16(b).val.0,
                 );
-                token.cvt_from_bytes_i16x32(u8x64 {
+                token.cvt_from_bytes_i32x16(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -10829,136 +13660,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
         self,
-        a: i16x32<Self>,
-        b: i16x32<Self>,
-    ) -> i16x32<Self> {
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 8usize {
+        if SHIFT >= 4usize {
             return b;
         }
-        let a = self.cvt_to_bytes_i16x32(a).val.0;
-        let b = self.cvt_to_bytes_i16x32(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT * 2usize);
-        self.cvt_from_bytes_i16x32(u8x64 {
+        let a = self.cvt_to_bytes_i32x16(a).val.0;
+        let b = self.cvt_to_bytes_i32x16(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 4usize);
+        self.cvt_from_bytes_i32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_add_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_add_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_sub_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_sub_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_mullo_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_mullo_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
                 _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
                 _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
                 _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, shift: u32) -> i16x32<Avx512> {
-                _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, shift: u32) -> i32x16<Avx512> {
+                _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_sllv_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_sllv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, shift: u32) -> i16x32<Avx512> {
-                _mm512_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, shift: u32) -> i32x16<Avx512> {
+                _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_srav_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_srav_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmpeq_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpeq_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10966,12 +13797,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmplt_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmplt_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10979,12 +13810,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmple_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmple_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -10992,12 +13823,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmpge_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpge_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11005,12 +13836,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmpgt_epi16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpgt_epi32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11018,16 +13849,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
-                        37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
-                    ),
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11036,16 +13864,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
-                        22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
-                    ),
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11054,16 +13879,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
-                        24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
-                    ),
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11072,16 +13894,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
-                        25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
-                    ),
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11090,31 +13909,27 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i16x32<Avx512>,
-                b: i16x32<Avx512>,
-            ) -> (i16x32<Avx512>, i16x32<Avx512>) {
+                a: i32x16<Avx512>,
+                b: i32x16<Avx512>,
+            ) -> (i32x16<Avx512>, i32x16<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7,
-                            38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
-                        ),
+                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23,
-                            54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                        _mm512_setr_epi32(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
                         ),
                         b,
                     )
@@ -11125,31 +13940,29 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i16x32<Avx512>,
-                b: i16x32<Avx512>,
-            ) -> (i16x32<Avx512>, i16x32<Avx512>) {
+                a: i32x16<Avx512>,
+                b: i32x16<Avx512>,
+            ) -> (i32x16<Avx512>, i32x16<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
-                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        _mm512_setr_epi32(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
-                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        _mm512_setr_epi32(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
                         ),
                         b,
                     )
@@ -11160,45 +13973,45 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
-                token: Avx512,
-                a: mask16x32<Avx512>,
-                b: i16x32<Avx512>,
-                c: i16x32<Avx512>,
-            ) -> i16x32<Avx512> {
-                _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+                token: Avx512,
+                a: mask32x16<Avx512>,
+                b: i32x16<Avx512>,
+                c: i32x16<Avx512>,
+            ) -> i32x16<Avx512> {
+                _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_min_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_min_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>, b: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_max_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_max_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>) -> (i16x16<Avx512>, i16x16<Avx512>) {
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> (i32x8<Avx512>, i32x8<Avx512>) {
                 (
                     _mm512_castsi512_si256(a.into()).simd_into(token),
                     _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
@@ -11208,100 +14021,110 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>) -> i16x32<Avx512> {
-                _mm512_sub_epi16(_mm512_setzero_si512(), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> i32x16<Avx512> {
+                _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> u8x64<Avx512> {
                 __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i16x32<Avx512>) -> u32x16<Avx512> {
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> u32x16<Avx512> {
                 __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: u16) -> u16x32<Avx512> {
-                _mm512_set1_epi16(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx512, a: i32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_cvtepi32_ps(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, val: u32) -> u32x16<Avx512> {
+                _mm512_set1_epi32(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
-        crate::transmute::checked_transmute_copy::<__m512i, [u16; 32usize]>(&a.val.0)
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
-        crate::transmute::checked_cast_ref::<__m512i, [u16; 32usize]>(&a.val.0)
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
-        crate::transmute::checked_cast_mut::<__m512i, [u16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u16x32<Avx512>,
-                b: u16x32<Avx512>,
+                a: u32x16<Avx512>,
+                b: u32x16<Avx512>,
                 shift: usize,
-            ) -> u16x32<Avx512> {
-                if shift >= 32usize {
+            ) -> u32x16<Avx512> {
+                if shift >= 16usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -11311,14 +14134,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift * 2usize) as i8),
+                    _mm512_set1_epi8((shift * 4usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_u16x32(a).val.0,
+                    token.cvt_to_bytes_u32x16(a).val.0,
                     idx,
-                    token.cvt_to_bytes_u16x32(b).val.0,
+                    token.cvt_to_bytes_u32x16(b).val.0,
                 );
-                token.cvt_from_bytes_u16x32(u8x64 {
+                token.cvt_from_bytes_u32x16(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -11327,136 +14150,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
         self,
-        a: u16x32<Self>,
-        b: u16x32<Self>,
-    ) -> u16x32<Self> {
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 8usize {
+        if SHIFT >= 4usize {
             return b;
         }
-        let a = self.cvt_to_bytes_u16x32(a).val.0;
-        let b = self.cvt_to_bytes_u16x32(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT * 2usize);
-        self.cvt_from_bytes_u16x32(u8x64 {
+        let a = self.cvt_to_bytes_u32x16(a).val.0;
+        let b = self.cvt_to_bytes_u32x16(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 4usize);
+        self.cvt_from_bytes_u32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_add_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_add_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_sub_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_sub_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_mullo_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_mullo_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
                 _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
                 _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
                 _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, shift: u32) -> u16x32<Avx512> {
-                _mm512_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, shift: u32) -> u32x16<Avx512> {
+                _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_sllv_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_sllv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, shift: u32) -> u16x32<Avx512> {
-                _mm512_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, shift: u32) -> u32x16<Avx512> {
+                _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_srlv_epi16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_srlv_epi32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmpeq_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpeq_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11464,12 +14287,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmplt_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmplt_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11477,12 +14300,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmple_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmple_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11490,12 +14313,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmpge_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpge_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11503,12 +14326,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> mask16x32<Avx512> {
-                mask16x32 {
-                    val: _mm512_cmpgt_epu16_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
+                mask32x16 {
+                    val: _mm512_cmpgt_epu32_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -11516,16 +14339,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7, 38, 6,
-                        37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
-                    ),
+                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11534,16 +14354,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23, 54,
-                        22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
-                    ),
+                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11552,16 +14369,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_permutex2var_epi16(
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
                     a.into(),
-                    _mm512_set_epi16(
-                        62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
-                        24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
-                    ),
+                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11570,16 +14384,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_permutex2var_epi16(
-                    a.into(),
-                    _mm512_set_epi16(
-                        63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29, 27,
-                        25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
-                    ),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_permutex2var_epi32(
+                    a.into(),
+                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
                     b.into(),
                 )
                 .simd_into(token)
@@ -11588,31 +14399,27 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u16x32<Avx512>,
-                b: u16x32<Avx512>,
-            ) -> (u16x32<Avx512>, u16x32<Avx512>) {
+                a: u32x16<Avx512>,
+                b: u32x16<Avx512>,
+            ) -> (u32x16<Avx512>, u32x16<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, 39, 7,
-                            38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0,
-                        ),
+                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, 55, 23,
-                            54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16,
+                        _mm512_setr_epi32(
+                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
                         ),
                         b,
                     )
@@ -11623,31 +14430,29 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u16x32<Avx512>,
-                b: u16x32<Avx512>,
-            ) -> (u16x32<Avx512>, u16x32<Avx512>) {
+                a: u32x16<Avx512>,
+                b: u32x16<Avx512>,
+            ) -> (u32x16<Avx512>, u32x16<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28,
-                            26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
+                        _mm512_setr_epi32(
+                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
                         ),
                         b,
                     )
                     .simd_into(token),
-                    _mm512_permutex2var_epi16(
+                    _mm512_permutex2var_epi32(
                         a,
-                        _mm512_set_epi16(
-                            63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, 31, 29,
-                            27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1,
+                        _mm512_setr_epi32(
+                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
                         ),
                         b,
                     )
@@ -11658,45 +14463,45 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask16x32<Avx512>,
-                b: u16x32<Avx512>,
-                c: u16x32<Avx512>,
-            ) -> u16x32<Avx512> {
-                _mm512_mask_blend_epi16(a.val, c.into(), b.into()).simd_into(token)
+                a: mask32x16<Avx512>,
+                b: u32x16<Avx512>,
+                c: u32x16<Avx512>,
+            ) -> u32x16<Avx512> {
+                _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_min_epu16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_min_epu32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, b: u16x32<Avx512>) -> u16x32<Avx512> {
-                _mm512_max_epu16(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+                _mm512_max_epu32(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>) -> (u16x16<Avx512>, u16x16<Avx512>) {
+            fn kernel(token: Avx512, a: u32x16<Avx512>) -> (u32x8<Avx512>, u32x8<Avx512>) {
                 (
                     _mm512_castsi512_si256(a.into()).simd_into(token),
                     _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
@@ -11706,17 +14511,14 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, src: &[u16; 32usize]) -> u16x32<Avx512> {
+            fn kernel(token: Avx512, src: &[u32; 16usize]) -> u32x16<Avx512> {
                 let lanes: __m512i =
-                    crate::transmute::checked_transmute_copy::<[u16; 32usize], __m512i>(src);
-                _mm512_permutexvar_epi16(
-                    _mm512_set_epi16(
-                        31, 27, 23, 19, 15, 11, 7, 3, 30, 26, 22, 18, 14, 10, 6, 2, 29, 25, 21, 17,
-                        13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0,
-                    ),
+                    crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src);
+                _mm512_permutexvar_epi32(
+                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
                     lanes,
                 )
                 .simd_into(token)
@@ -11725,67 +14527,54 @@ impl Simd for Avx512 {
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>, dest: &mut [u16; 32usize]) -> () {
-                let lanes = _mm512_permutexvar_epi16(
-                    _mm512_set_epi16(
-                        31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4, 27, 19, 11, 3,
-                        26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0,
-                    ),
+            fn kernel(token: Avx512, a: u32x16<Avx512>, dest: &mut [u32; 16usize]) -> () {
+                let lanes = _mm512_permutexvar_epi32(
+                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
                     a.into(),
                 );
-                crate::transmute::checked_transmute_store::<__m512i, [u16; 32usize]>(lanes, dest);
+                crate::transmute::checked_transmute_store::<__m512i, [u32; 16usize]>(lanes, dest);
             }
         );
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>) -> u8x32<Avx512> {
-                _mm512_cvtepi16_epi8(a.into()).simd_into(token)
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>) -> u8x64<Avx512> {
+            fn kernel(token: Avx512, a: u32x16<Avx512>) -> u8x64<Avx512> {
                 __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u16x32<Avx512>) -> u32x16<Avx512> {
-                __m512i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: u32x16<Avx512>) -> f32x16<Avx512> {
+                _mm512_cvtepu32_ps(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
-        mask16x32 {
-            val: (if val { 4294967295u64 } else { 0 }) as _,
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        mask32x16 {
+            val: (if val { 65535u64 } else { 0 }) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: [i16; 32usize]) -> mask16x32<Avx512> {
+            fn kernel(token: Avx512, val: [i32; 16usize]) -> mask32x16<Avx512> {
                 let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask16x32 {
-                    val: _mm512_movepi16_mask(lanes),
+                mask32x16 {
+                    val: _mm512_movepi32_mask(lanes),
                     simd: token,
                 }
             }
@@ -11793,190 +14582,190 @@ impl Simd for Avx512 {
         kernel(self, val)
     }
     #[inline(always)]
-    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: mask16x32<Avx512>) -> [i16; 32usize] {
-                let lanes = _mm512_movm_epi16(a.val);
+            fn kernel(token: Avx512, a: mask32x16<Avx512>) -> [i32; 16usize] {
+                let lanes = _mm512_movm_epi32(a.val);
                 crate::transmute::checked_transmute_copy(&lanes)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
-        mask16x32 {
-            val: (bits & 4294967295u64) as _,
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        mask32x16 {
+            val: (bits & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
-        u64::from((a).val) & 4294967295u64
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        u64::from((a).val) & 65535u64
     }
     #[inline(always)]
-    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 32usize,
+            index < 16usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            32usize
+            16usize
         );
         let bit = 1u64 << index;
         let bits = u64::from((a).val);
         let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask16x32 {
+        *a = mask32x16 {
             val: (bits) as _,
             simd: self,
         };
     }
     #[inline(always)]
-    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 4294967295u64) as _,
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        mask32x16 {
+            val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 4294967295u64) as _,
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        mask32x16 {
+            val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 4294967295u64) as _,
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        mask32x16 {
+            val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: ((!u64::from((a).val)) & 4294967295u64) as _,
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        mask32x16 {
+            val: ((!u64::from((a).val)) & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn select_mask16x32(
+    fn select_mask32x16(
         self,
-        a: mask16x32<Self>,
-        b: mask16x32<Self>,
-        c: mask16x32<Self>,
-    ) -> mask16x32<Self> {
-        mask16x32 {
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        mask32x16 {
             val: (((u64::from((a).val) & u64::from((b).val))
                 | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 4294967295u64) as _,
+                & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: (!u64::from(a.val ^ b.val) & 4294967295u64) as _,
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        mask32x16 {
+            val: (!u64::from(a.val ^ b.val) & 65535u64) as _,
             simd: self,
         }
     }
     #[inline(always)]
-    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
         bits != 0
     }
     #[inline(always)]
-    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
-        bits == 4294967295u64
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits == 65535u64
     }
     #[inline(always)]
-    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
-        bits != 4294967295u64
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
+        bits != 65535u64
     }
     #[inline(always)]
-    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let bits = u64::from((a).val) & 4294967295u64;
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let bits = u64::from((a).val) & 65535u64;
         bits == 0
     }
     #[inline(always)]
-    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
         let bits = u64::from(a.val);
         (
-            mask16x16 {
-                val: (bits & 65535u64) as _,
+            mask32x8 {
+                val: (bits & 255u64) as _,
                 simd: self,
             },
-            mask16x16 {
-                val: ((bits >> 16usize) & 65535u64) as _,
+            mask32x8 {
+                val: ((bits >> 8usize) & 255u64) as _,
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: i32) -> i32x16<Avx512> {
-                _mm512_set1_epi32(val).simd_into(token)
+            fn kernel(token: Avx512, val: f64) -> f64x8<Avx512> {
+                _mm512_set1_pd(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m512i, [i32; 16usize]>(&a.val.0)
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
-        crate::transmute::checked_cast_ref::<__m512i, [i32; 16usize]>(&a.val.0)
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
-        crate::transmute::checked_cast_mut::<__m512i, [i32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
-        i32x16 {
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i32x16<Avx512>,
-                b: i32x16<Avx512>,
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
                 shift: usize,
-            ) -> i32x16<Avx512> {
-                if shift >= 16usize {
+            ) -> f64x8<Avx512> {
+                if shift >= 8usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -11986,14 +14775,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift * 4usize) as i8),
+                    _mm512_set1_epi8((shift * 8usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_i32x16(a).val.0,
+                    token.cvt_to_bytes_f64x8(a).val.0,
                     idx,
-                    token.cvt_to_bytes_i32x16(b).val.0,
+                    token.cvt_to_bytes_f64x8(b).val.0,
                 );
-                token.cvt_from_bytes_i32x16(u8x64 {
+                token.cvt_from_bytes_f64x8(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -12002,136 +14791,140 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
         self,
-        a: i32x16<Self>,
-        b: i32x16<Self>,
-    ) -> i32x16<Self> {
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 4usize {
+        if SHIFT >= 2usize {
             return b;
         }
-        let a = self.cvt_to_bytes_i32x16(a).val.0;
-        let b = self.cvt_to_bytes_i32x16(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT * 4usize);
-        self.cvt_from_bytes_i32x16(u8x64 {
+        let a = self.cvt_to_bytes_f64x8(a).val.0;
+        let b = self.cvt_to_bytes_f64x8(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 8usize);
+        self.cvt_from_bytes_f64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_add_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_sub_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_mullo_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_sqrt_pd(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_and_si512(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_rcp14_pd(a.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_or_si512(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_add_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_xor_si512(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_sub_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        a ^ !0
-    }
-    #[inline(always)]
-    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, shift: u32) -> i32x16<Avx512> {
-                _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_mul_pd(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_sllv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_div_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, shift: u32) -> i32x16<Avx512> {
-                _mm512_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                let mask = _mm512_set1_pd(-0.0);
+                _mm512_or_pd(
+                    _mm512_and_pd(mask, b.into()),
+                    _mm512_andnot_pd(mask, a.into()),
+                )
+                .simd_into(token)
             }
         );
-        kernel(self, a, shift)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_srav_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                    simd: token,
+                }
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmpeq_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12139,12 +14932,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmplt_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12152,12 +14945,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmple_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12165,12 +14958,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmpge_epi32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12178,26 +14971,28 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmpgt_epi32_mask(a.into(), b.into()),
-                    simd: token,
-                }
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
+                    a.into(),
+                    _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11),
+                    b.into(),
+                )
+                .simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_permutex2var_epi32(
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
                     a.into(),
-                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -12206,13 +15001,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_permutex2var_epi32(
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
                     a.into(),
-                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14),
                     b.into(),
                 )
                 .simd_into(token)
@@ -12221,252 +15016,283 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_permutex2var_pd(
+                    a.into(),
+                    _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15),
+                    b.into(),
+                )
+                .simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+            ) -> (f64x8<Avx512>, f64x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Avx512,
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+            ) -> (f64x8<Avx512>, f64x8<Avx512>) {
+                let a = a.into();
+                let b = b.into();
+                (
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
+                )
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_max_pd(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_permutex2var_epi32(
-                    a.into(),
-                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_min_pd(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_permutex2var_epi32(
-                    a.into(),
-                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
-                    b.into(),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: i32x16<Avx512>,
-                b: i32x16<Avx512>,
-            ) -> (i32x16<Avx512>, i32x16<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(
-                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                )
+            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: i32x16<Avx512>,
-                b: i32x16<Avx512>,
-            ) -> (i32x16<Avx512>, i32x16<Avx512>) {
-                let a = a.into();
-                let b = b.into();
-                (
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                )
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+                c: f64x8<Avx512>,
+            ) -> f64x8<Avx512> {
+                _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask32x16<Avx512>,
-                b: i32x16<Avx512>,
-                c: i32x16<Avx512>,
-            ) -> i32x16<Avx512> {
-                _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+                a: f64x8<Avx512>,
+                b: f64x8<Avx512>,
+                c: f64x8<Avx512>,
+            ) -> f64x8<Avx512> {
+                _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_min_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>, b: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_max_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a)
     }
     #[inline(always)]
-    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>) -> (i32x8<Avx512>, i32x8<Avx512>) {
-                (
-                    _mm512_castsi512_si256(a.into()).simd_into(token),
-                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        a - self.trunc_f64x8(a)
+    }
+    #[inline(always)]
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>) -> i32x16<Avx512> {
-                _mm512_sub_epi32(_mm512_setzero_si512(), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
+                _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+                    .simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>) -> u8x64<Avx512> {
-                __m512i::from(a).simd_into(token)
+            fn kernel(
+                token: Avx512,
+                a: mask64x8<Avx512>,
+                b: f64x8<Avx512>,
+                c: f64x8<Avx512>,
+            ) -> f64x8<Avx512> {
+                _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>) -> u32x16<Avx512> {
-                __m512i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> (f64x4<Avx512>, f64x4<Avx512>) {
+                (
+                    _mm512_castpd512_pd256(a.into()).simd_into(token),
+                    _mm512_extractf64x4_pd::<1>(a.into()).simd_into(token),
+                )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: i32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_cvtepi32_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f32x16<Avx512> {
+                _mm512_castpd_ps(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+    fn splat_i64x8(self, val: i64) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: u32) -> u32x16<Avx512> {
-                _mm512_set1_epi32(val.cast_signed()).simd_into(token)
+            fn kernel(token: Avx512, val: i64) -> i64x8<Avx512> {
+                _mm512_set1_epi64(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
-        crate::transmute::checked_transmute_copy::<__m512i, [u32; 16usize]>(&a.val.0)
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
-        crate::transmute::checked_cast_ref::<__m512i, [u32; 16usize]>(&a.val.0)
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
-        crate::transmute::checked_cast_mut::<__m512i, [u32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [i64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
-        u32x16 {
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u32x16<Avx512>,
-                b: u32x16<Avx512>,
+                a: i64x8<Avx512>,
+                b: i64x8<Avx512>,
                 shift: usize,
-            ) -> u32x16<Avx512> {
-                if shift >= 16usize {
+            ) -> i64x8<Avx512> {
+                if shift >= 8usize {
                     return b;
                 }
                 let idx = _mm512_add_epi8(
@@ -12476,14 +15302,14 @@ impl Simd for Avx512 {
                         25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
                         5, 4, 3, 2, 1, 0,
                     ),
-                    _mm512_set1_epi8((shift * 4usize) as i8),
+                    _mm512_set1_epi8((shift * 8usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_u32x16(a).val.0,
+                    token.cvt_to_bytes_i64x8(a).val.0,
                     idx,
-                    token.cvt_to_bytes_u32x16(b).val.0,
+                    token.cvt_to_bytes_i64x8(b).val.0,
                 );
-                token.cvt_from_bytes_u32x16(u8x64 {
+                token.cvt_from_bytes_i64x8(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -12492,136 +15318,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x16<const SHIFT: usize>(
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
         self,
-        a: u32x16<Self>,
-        b: u32x16<Self>,
-    ) -> u32x16<Self> {
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self> {
         if SHIFT == 0 {
             return a;
         }
-        if SHIFT >= 4usize {
+        if SHIFT >= 2usize {
             return b;
         }
-        let a = self.cvt_to_bytes_u32x16(a).val.0;
-        let b = self.cvt_to_bytes_u32x16(b).val.0;
-        let result = dyn_alignr_512(self, b, a, SHIFT * 4usize);
-        self.cvt_from_bytes_u32x16(u8x64 {
+        let a = self.cvt_to_bytes_i64x8(a).val.0;
+        let b = self.cvt_to_bytes_i64x8(b).val.0;
+        let result = dyn_alignr_512(self, b, a, SHIFT * 8usize);
+        self.cvt_from_bytes_i64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_add_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_sub_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_mullo_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_mullo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
                 _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
                 _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
                 _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
         a ^ !0
     }
     #[inline(always)]
-    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, shift: u32) -> u32x16<Avx512> {
-                _mm512_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, shift: u32) -> i64x8<Avx512> {
+                _mm512_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_sllv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, shift: u32) -> u32x16<Avx512> {
-                _mm512_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, shift: u32) -> i64x8<Avx512> {
+                _mm512_sra_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
         kernel(self, a, shift)
     }
     #[inline(always)]
-    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_srlv_epi32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_srav_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmpeq_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmpeq_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12629,12 +15455,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmplt_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmplt_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12642,12 +15468,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmple_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmple_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12655,12 +15481,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmpge_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmpge_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12668,12 +15494,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> mask32x16<Avx512> {
-                mask32x16 {
-                    val: _mm512_cmpgt_epu32_mask(a.into(), b.into()),
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> mask64x8<Avx512> {
+                mask64x8 {
+                    val: _mm512_cmpgt_epi64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -12681,13 +15507,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_permutex2var_epi32(
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
-                    _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
+                    _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11),
                     b.into(),
                 )
                 .simd_into(token)
@@ -12696,13 +15522,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_permutex2var_epi32(
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
-                    _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31),
+                    _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -12711,13 +15537,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_permutex2var_epi32(
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
-                    _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30),
+                    _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14),
                     b.into(),
                 )
                 .simd_into(token)
@@ -12726,13 +15552,13 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_permutex2var_epi32(
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
-                    _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31),
+                    _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15),
                     b.into(),
                 )
                 .simd_into(token)
@@ -12741,109 +15567,87 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u32x16<Avx512>,
-                b: u32x16<Avx512>,
-            ) -> (u32x16<Avx512>, u32x16<Avx512>) {
+                a: i64x8<Avx512>,
+                b: i64x8<Avx512>,
+            ) -> (i64x8<Avx512>, i64x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(
-                            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
+                        .simd_into(token),
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: u32x16<Avx512>,
-                b: u32x16<Avx512>,
-            ) -> (u32x16<Avx512>, u32x16<Avx512>) {
+                a: i64x8<Avx512>,
+                b: i64x8<Avx512>,
+            ) -> (i64x8<Avx512>, i64x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(
-                            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
-                    _mm512_permutex2var_epi32(
-                        a,
-                        _mm512_setr_epi32(
-                            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
-                        ),
-                        b,
-                    )
-                    .simd_into(token),
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
+                        .simd_into(token),
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
+                        .simd_into(token),
                 )
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: mask32x16<Avx512>,
-                b: u32x16<Avx512>,
-                c: u32x16<Avx512>,
-            ) -> u32x16<Avx512> {
-                _mm512_mask_blend_epi32(a.val, c.into(), b.into()).simd_into(token)
+                a: mask64x8<Avx512>,
+                b: i64x8<Avx512>,
+                c: i64x8<Avx512>,
+            ) -> i64x8<Avx512> {
+                _mm512_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_min_epu32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_min_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, b: u32x16<Avx512>) -> u32x16<Avx512> {
-                _mm512_max_epu32(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>, b: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_max_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>) {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>) -> (u32x8<Avx512>, u32x8<Avx512>) {
+            fn kernel(token: Avx512, a: i64x8<Avx512>) -> (i64x4<Avx512>, i64x4<Avx512>) {
                 (
                     _mm512_castsi512_si256(a.into()).simd_into(token),
                     _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token),
@@ -12853,260 +15657,99 @@ impl Simd for Avx512 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, src: &[u32; 16usize]) -> u32x16<Avx512> {
-                let lanes: __m512i =
-                    crate::transmute::checked_transmute_copy::<[u32; 16usize], __m512i>(src);
-                _mm512_permutexvar_epi32(
-                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
-                    lanes,
-                )
-                .simd_into(token)
-            }
-        );
-        kernel(self, src)
-    }
-    #[inline(always)]
-    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>, dest: &mut [u32; 16usize]) -> () {
-                let lanes = _mm512_permutexvar_epi32(
-                    _mm512_setr_epi32(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15),
-                    a.into(),
-                );
-                crate::transmute::checked_transmute_store::<__m512i, [u32; 16usize]>(lanes, dest);
-            }
-        );
-        kernel(self, a, dest);
-    }
-    #[inline(always)]
-    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>) -> u8x64<Avx512> {
-                __m512i::from(a).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>) -> i64x8<Avx512> {
+                _mm512_sub_epi64(_mm512_setzero_si512(), a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: u32x16<Avx512>) -> f32x16<Avx512> {
-                _mm512_cvtepu32_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: i64x8<Avx512>) -> u8x64<Avx512> {
+                __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
-        mask32x16 {
-            val: (if val { 65535u64 } else { 0 }) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, val: [i32; 16usize]) -> mask32x16<Avx512> {
-                let lanes = crate::transmute::checked_transmute_copy(&val);
-                mask32x16 {
-                    val: _mm512_movepi32_mask(lanes),
-                    simd: token,
-                }
-            }
-        );
-        kernel(self, val)
-    }
-    #[inline(always)]
-    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: mask32x16<Avx512>) -> [i32; 16usize] {
-                let lanes = _mm512_movm_epi32(a.val);
-                crate::transmute::checked_transmute_copy(&lanes)
+            fn kernel(token: Avx512, a: i64x8<Avx512>) -> u32x16<Avx512> {
+                __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
-        mask32x16 {
-            val: (bits & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
-        u64::from((a).val) & 65535u64
-    }
-    #[inline(always)]
-    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let bit = 1u64 << index;
-        let bits = u64::from((a).val);
-        let bits = if value { bits | bit } else { bits & !bit };
-        *a = mask32x16 {
-            val: (bits) as _,
-            simd: self,
-        };
-    }
-    #[inline(always)]
-    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: ((u64::from((a).val) & u64::from((b).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: ((u64::from((a).val) | u64::from((b).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: ((u64::from((a).val) ^ u64::from((b).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: ((!u64::from((a).val)) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn select_mask32x16(
-        self,
-        a: mask32x16<Self>,
-        b: mask32x16<Self>,
-        c: mask32x16<Self>,
-    ) -> mask32x16<Self> {
-        mask32x16 {
-            val: (((u64::from((a).val) & u64::from((b).val))
-                | ((!u64::from((a).val)) & u64::from((c).val)))
-                & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: (!u64::from(a.val ^ b.val) & 65535u64) as _,
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits != 0
-    }
-    #[inline(always)]
-    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits == 65535u64
-    }
-    #[inline(always)]
-    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits != 65535u64
-    }
-    #[inline(always)]
-    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let bits = u64::from((a).val) & 65535u64;
-        bits == 0
-    }
-    #[inline(always)]
-    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
-        let bits = u64::from(a.val);
-        (
-            mask32x8 {
-                val: (bits & 255u64) as _,
-                simd: self,
-            },
-            mask32x8 {
-                val: ((bits >> 8usize) & 255u64) as _,
-                simd: self,
-            },
-        )
-    }
-    #[inline(always)]
-    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+    fn splat_u64x8(self, val: u64) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, val: f64) -> f64x8<Avx512> {
-                _mm512_set1_pd(val).simd_into(token)
+            fn kernel(token: Avx512, val: u64) -> u64x8<Avx512> {
+                _mm512_set1_epi64(val.cast_signed()).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
-    #[inline(always)]
-    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    #[inline(always)]
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
-        crate::transmute::checked_transmute_copy::<__m512d, [f64; 8usize]>(&a.val.0)
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize] {
+        crate::transmute::checked_transmute_copy::<__m512i, [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
-        crate::transmute::checked_cast_ref::<__m512d, [f64; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize] {
+        crate::transmute::checked_cast_ref::<__m512i, [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
-        crate::transmute::checked_cast_mut::<__m512d, [f64; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize] {
+        crate::transmute::checked_cast_mut::<__m512i, [u64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x8<Avx512>,
-                b: f64x8<Avx512>,
+                a: u64x8<Avx512>,
+                b: u64x8<Avx512>,
                 shift: usize,
-            ) -> f64x8<Avx512> {
+            ) -> u64x8<Avx512> {
                 if shift >= 8usize {
                     return b;
                 }
@@ -13120,11 +15763,11 @@ impl Simd for Avx512 {
                     _mm512_set1_epi8((shift * 8usize) as i8),
                 );
                 let result = _mm512_permutex2var_epi8(
-                    token.cvt_to_bytes_f64x8(a).val.0,
+                    token.cvt_to_bytes_u64x8(a).val.0,
                     idx,
-                    token.cvt_to_bytes_f64x8(b).val.0,
+                    token.cvt_to_bytes_u64x8(b).val.0,
                 );
-                token.cvt_from_bytes_f64x8(u8x64 {
+                token.cvt_from_bytes_u64x8(u8x64 {
                     val: crate::support::Aligned512(result),
                     simd: token,
                 })
@@ -13133,127 +15776,136 @@ impl Simd for Avx512 {
         kernel(self, a, b, SHIFT)
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
         self,
-        a: f64x8<Self>,
-        b: f64x8<Self>,
-    ) -> f64x8<Self> {
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self> {
         if SHIFT == 0 {
             return a;
         }
         if SHIFT >= 2usize {
             return b;
         }
-        let a = self.cvt_to_bytes_f64x8(a).val.0;
-        let b = self.cvt_to_bytes_f64x8(b).val.0;
+        let a = self.cvt_to_bytes_u64x8(a).val.0;
+        let b = self.cvt_to_bytes_u64x8(b).val.0;
         let result = dyn_alignr_512(self, b, a, SHIFT * 8usize);
-        self.cvt_from_bytes_f64x8(u8x64 {
+        self.cvt_from_bytes_u64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_andnot_pd(_mm512_set1_pd(-0.0), a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_xor_pd(a.into(), _mm512_set1_pd(-0.0)).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_sqrt_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_mullo_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_rcp14_pd(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_and_si512(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_add_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_or_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_sub_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_xor_si512(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self> {
+        a ^ !0
+    }
+    #[inline(always)]
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_mul_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, shift: u32) -> u64x8<Avx512> {
+                _mm512_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_div_pd(a.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_sllv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                let mask = _mm512_set1_pd(-0.0);
-                _mm512_or_pd(
-                    _mm512_and_pd(mask, b.into()),
-                    _mm512_andnot_pd(mask, a.into()),
-                )
-                .simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, shift: u32) -> u64x8<Avx512> {
+                _mm512_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)]
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_srlv_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> mask64x8<Avx512> {
                 mask64x8 {
-                    val: _mm512_cmp_pd_mask::<0i32>(a.into(), b.into()),
+                    val: _mm512_cmpeq_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -13261,12 +15913,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> mask64x8<Avx512> {
                 mask64x8 {
-                    val: _mm512_cmp_pd_mask::<17i32>(a.into(), b.into()),
+                    val: _mm512_cmplt_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -13274,12 +15926,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> mask64x8<Avx512> {
                 mask64x8 {
-                    val: _mm512_cmp_pd_mask::<18i32>(a.into(), b.into()),
+                    val: _mm512_cmple_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -13287,12 +15939,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> mask64x8<Avx512> {
                 mask64x8 {
-                    val: _mm512_cmp_pd_mask::<29i32>(a.into(), b.into()),
+                    val: _mm512_cmpge_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -13300,12 +15952,12 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> mask64x8<Avx512> {
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> mask64x8<Avx512> {
                 mask64x8 {
-                    val: _mm512_cmp_pd_mask::<30i32>(a.into(), b.into()),
+                    val: _mm512_cmpgt_epu64_mask(a.into(), b.into()),
                     simd: token,
                 }
             }
@@ -13313,11 +15965,11 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_permutex2var_pd(
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
                     _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11),
                     b.into(),
@@ -13328,11 +15980,11 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_permutex2var_pd(
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
                     _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15),
                     b.into(),
@@ -13343,11 +15995,11 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_permutex2var_pd(
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
                     _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14),
                     b.into(),
@@ -13358,11 +16010,11 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_permutex2var_pd(
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_permutex2var_epi64(
                     a.into(),
                     _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15),
                     b.into(),
@@ -13373,20 +16025,20 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x8<Avx512>,
-                b: f64x8<Avx512>,
-            ) -> (f64x8<Avx512>, f64x8<Avx512>) {
+                a: u64x8<Avx512>,
+                b: u64x8<Avx512>,
+            ) -> (u64x8<Avx512>, u64x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11), b)
                         .simd_into(token),
-                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15), b)
                         .simd_into(token),
                 )
             }
@@ -13394,20 +16046,20 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x8<Avx512>,
-                b: f64x8<Avx512>,
-            ) -> (f64x8<Avx512>, f64x8<Avx512>) {
+                a: u64x8<Avx512>,
+                b: u64x8<Avx512>,
+            ) -> (u64x8<Avx512>, u64x8<Avx512>) {
                 let a = a.into();
                 let b = b.into();
                 (
-                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), b)
                         .simd_into(token),
-                    _mm512_permutex2var_pd(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
+                    _mm512_permutex2var_epi64(a, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), b)
                         .simd_into(token),
                 )
             }
@@ -13415,157 +16067,94 @@ impl Simd for Avx512 {
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_max_pd(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_min_pd(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_range_pd::<5i32>(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>, b: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_range_pd::<4i32>(a.into(), b.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b)
-    }
-    #[inline(always)]
-    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: f64x8<Avx512>,
-                b: f64x8<Avx512>,
-                c: f64x8<Avx512>,
-            ) -> f64x8<Avx512> {
-                _mm512_fmadd_pd(a.into(), b.into(), c.into()).simd_into(token)
-            }
-        );
-        kernel(self, a, b, c)
-    }
-    #[inline(always)]
-    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Avx512,
-                a: f64x8<Avx512>,
-                b: f64x8<Avx512>,
-                c: f64x8<Avx512>,
-            ) -> f64x8<Avx512> {
-                _mm512_fmsub_pd(a.into(), b.into(), c.into()).simd_into(token)
+                a: mask64x8<Avx512>,
+                b: u64x8<Avx512>,
+                c: u64x8<Avx512>,
+            ) -> u64x8<Avx512> {
+                _mm512_mask_blend_epi64(a.val, c.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_min_epu64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_roundscale_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, b: u64x8<Avx512>) -> u64x8<Avx512> {
+                _mm512_max_epu64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>) { // returns (lanes 0..4, lanes 4..8) of a
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_roundscale_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
-                    .simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>) -> (u64x4<Avx512>, u64x4<Avx512>) {
+                (
+                    _mm512_castsi512_si256(a.into()).simd_into(token), // low 256 bits of the register
+                    _mm512_extracti64x4_epi64::<1>(a.into()).simd_into(token), // 256-bit half selected by index 1 (the upper half)
+                )
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        a - self.trunc_f64x8(a)
-    }
-    #[inline(always)]
-    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> { // NOTE(review): this index vector is the inverse of the one in store_interleaved_128_u64x8; confirm the two are not swapped relative to the Fallback lane order
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f64x8<Avx512> {
-                _mm512_roundscale_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into())
+            fn kernel(token: Avx512, src: &[u64; 8usize]) -> u64x8<Avx512> { // reads all 8 words, then reorders them with a single permute
+                let lanes: __m512i = // `src` reinterpreted as __m512i through checked_transmute_copy
+                    crate::transmute::checked_transmute_copy::<[u64; 8usize], __m512i>(src);
+                _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), lanes) // output lane i = lanes[idx[i]] -> [s0, s2, s4, s6, s1, s3, s5, s7]
                     .simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, src)
     }
     #[inline(always)]
-    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () { // permutes the lanes of a, then writes all 8 words to dest
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Avx512,
-                a: mask64x8<Avx512>,
-                b: f64x8<Avx512>,
-                c: f64x8<Avx512>,
-            ) -> f64x8<Avx512> {
-                _mm512_mask_blend_pd(a.val, c.into(), b.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>, dest: &mut [u64; 8usize]) -> () {
+                let lanes = // output lane i = a[idx[i]] -> [a0, a4, a1, a5, a2, a6, a3, a7]
+                    _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7), a.into());
+                crate::transmute::checked_transmute_store::<__m512i, [u64; 8usize]>(lanes, dest); // writes the permuted register into dest
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, dest);
     }
     #[inline(always)]
-    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> (f64x4<Avx512>, f64x4<Avx512>) {
-                (
-                    _mm512_castpd512_pd256(a.into()).simd_into(token),
-                    _mm512_extractf64x4_pd::<1>(a.into()).simd_into(token),
-                )
+            fn kernel(token: Avx512, a: u64x8<Avx512>) -> u8x64<Avx512> {
+                __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Avx512, a: f64x8<Avx512>) -> f32x16<Avx512> {
-                _mm512_castpd_ps(a.into()).simd_into(token)
+            fn kernel(token: Avx512, a: u64x8<Avx512>) -> u32x16<Avx512> {
+                __m512i::from(a).simd_into(token)
             }
         );
         kernel(self, a)
@@ -14008,6 +16597,36 @@ impl<S: Simd> From<f64x8<S>> for __m512d {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<__m512i, S> for i64x8<S> { // wraps a raw 512-bit integer register as an i64x8 for any backend S
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self { // simd: backend token stored in the result; arch: raw register value
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // converts the register into this type's `val` storage
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x8<S>> for __m512i { // unwraps an i64x8 back into a raw 512-bit integer register
+    #[inline(always)]
+    fn from(value: i64x8<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val) // converts the `val` storage into __m512i; the backend token is dropped
+    }
+}
+impl<S: Simd> SimdFrom<__m512i, S> for u64x8<S> { // wraps a raw 512-bit integer register as a u64x8 for any backend S
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m512i) -> Self { // simd: backend token stored in the result; arch: raw register value
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch), // converts the register into this type's `val` storage
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x8<S>> for __m512i { // unwraps a u64x8 back into a raw 512-bit integer register
+    #[inline(always)]
+    fn from(value: u64x8<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val) // converts the `val` storage into __m512i; the backend token is dropped
+    }
+}
 impl<S: Simd> SimdFrom<__mmask8, S> for mask64x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __mmask8) -> Self {
diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
index 1024b172a..f1877087d 100644
--- a/fearless_simd/src/generated/fallback.rs
+++ b/fearless_simd/src/generated/fallback.rs
@@ -6,9 +6,9 @@
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 use core::ops::*;
 #[cfg(all(feature = "libm", not(feature = "std")))]
@@ -98,6 +98,8 @@ impl ArchTypes for Fallback {
     type u32x4 = crate::support::Aligned128<[u32; 4usize]>;
     type mask32x4 = crate::support::Aligned128<[i32; 4usize]>;
     type f64x2 = crate::support::Aligned128<[f64; 2usize]>;
+    type i64x2 = crate::support::Aligned128<[i64; 2usize]>;
+    type u64x2 = crate::support::Aligned128<[u64; 2usize]>;
     type mask64x2 = crate::support::Aligned128<[i64; 2usize]>;
     type f32x8 = crate::support::Aligned256<[f32; 8usize]>;
     type i8x32 = crate::support::Aligned256<[i8; 32usize]>;
@@ -110,6 +112,8 @@ impl ArchTypes for Fallback {
     type u32x8 = crate::support::Aligned256<[u32; 8usize]>;
     type mask32x8 = crate::support::Aligned256<[i32; 8usize]>;
     type f64x4 = crate::support::Aligned256<[f64; 4usize]>;
+    type i64x4 = crate::support::Aligned256<[i64; 4usize]>;
+    type u64x4 = crate::support::Aligned256<[u64; 4usize]>;
     type mask64x4 = crate::support::Aligned256<[i64; 4usize]>;
     type f32x16 = crate::support::Aligned512<[f32; 16usize]>;
     type i8x64 = crate::support::Aligned512<[i8; 64usize]>;
@@ -122,6 +126,8 @@ impl ArchTypes for Fallback {
     type u32x16 = crate::support::Aligned512<[u32; 16usize]>;
     type mask32x16 = crate::support::Aligned512<[i32; 16usize]>;
     type f64x8 = crate::support::Aligned512<[f64; 8usize]>;
+    type i64x8 = crate::support::Aligned512<[i64; 8usize]>;
+    type u64x8 = crate::support::Aligned512<[u64; 8usize]>;
     type mask64x8 = crate::support::Aligned512<[i64; 8usize]>;
 }
 impl Simd for Fallback {
@@ -133,6 +139,8 @@ impl Simd for Fallback {
     type i16s = i16x8<Self>;
     type u32s = u32x4<Self>;
     type i32s = i32x4<Self>;
+    type u64s = u64x2<Self>;
+    type i64s = i64x2<Self>;
     type mask8s = mask8x16<Self>;
     type mask16s = mask16x8<Self>;
     type mask32s = mask32x4<Self>;
@@ -1811,8 +1819,24 @@ impl Simd for Fallback {
     }
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
-        let lanes: [i8; 16usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        let lanes: [i8; 16usize] = [
+            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 8usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 9usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 10usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 11usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 12usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 13usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 14usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 15usize) & 1) != 0 { !0 } else { 0 },
+        ];
         lanes.simd_into(self)
     }
     #[inline(always)]
@@ -2979,8 +3003,16 @@ impl Simd for Fallback {
     }
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
-        let lanes: [i16; 8usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        let lanes: [i16; 8usize] = [
+            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 },
+        ];
         lanes.simd_into(self)
     }
     #[inline(always)]
@@ -3839,8 +3871,12 @@ impl Simd for Fallback {
     }
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
-        let lanes: [i32; 4usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        let lanes: [i32; 4usize] = [
+            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 },
+        ];
         lanes.simd_into(self)
     }
     #[inline(always)]
@@ -4235,14 +4271,518 @@ impl Simd for Fallback {
         .simd_into(self)
     }
     #[inline(always)]
-    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
-        let mut result = [0.0; 4usize];
+    fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
+        let mut result = [0.0; 4usize];
+        result[0..2usize].copy_from_slice(&a.val.0);
+        result[2usize..4usize].copy_from_slice(&b.val.0);
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
+        a.bitcast()
+    }
+    #[inline(always)]
+    fn splat_i64x2(self, val: i64) -> i64x2<Self> {
+        [val; 2usize].simd_into(self)
+    }
+    #[inline(always)]
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::support::Aligned128(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::support::Aligned128(*val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize] {
+        a.val.0
+    }
+    #[inline(always)]
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize] {
+        &a.val.0
+    }
+    #[inline(always)]
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize] {
+        &mut a.val.0
+    }
+    #[inline(always)]
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> () {
+        *dest = a.val.0;
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> { // result = a[SHIFT..] followed by b[..SHIFT]
+        let mut dest = [Default::default(); 2usize]; // NOTE(review): `2usize - SHIFT` below underflows for SHIFT > 2 — presumably callers only use 0..=2; confirm
+        dest[..2usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); // leading lanes: the tail of a
+        dest[2usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); // trailing lanes: the head of b
+        dest.simd_into(self)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self> {
+        self.slide_i64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::wrapping_add(a[0usize], b[0usize]),
+            i64::wrapping_add(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::wrapping_sub(a[0usize], b[0usize]),
+            i64::wrapping_sub(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::wrapping_mul(a[0usize], b[0usize]),
+            i64::wrapping_mul(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::bitand(a[0usize], &b[0usize]),
+            i64::bitand(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::bitor(a[0usize], &b[0usize]),
+            i64::bitor(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::bitxor(a[0usize], &b[0usize]),
+            i64::bitxor(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
+        [i64::not(a[0usize]), i64::not(a[1usize])].simd_into(self)
+    }
+    #[inline(always)]
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> { // shifts both lanes left by the same scalar amount
+        [i64::shl(a[0usize], shift), i64::shl(a[1usize], shift)].simd_into(self) // NOTE(review): scalar `<<` panics for shift >= 64 when overflow checks are on — confirm this matches the SIMD backends
+    }
+    #[inline(always)]
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::shl(a[0usize], &b[0usize]),
+            i64::shl(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
+        [i64::shr(a[0usize], shift), i64::shr(a[1usize], shift)].simd_into(self)
+    }
+    #[inline(always)]
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::shr(a[0usize], &b[0usize]),
+            i64::shr(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> { // lane-wise a == b, returned as a mask
+        [
+            -(i64::eq(&a[0usize], &b[0usize]) as i64), // true -> -1 (all bits set), false -> 0
+            -(i64::eq(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        [
+            -(i64::lt(&a[0usize], &b[0usize]) as i64),
+            -(i64::lt(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        [
+            -(i64::le(&a[0usize], &b[0usize]) as i64),
+            -(i64::le(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        [
+            -(i64::ge(&a[0usize], &b[0usize]) as i64),
+            -(i64::ge(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        [
+            -(i64::gt(&a[0usize], &b[0usize]) as i64),
+            -(i64::gt(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [a[0usize], b[0usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [a[1usize], b[1usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [a[0usize], b[0usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [a[1usize], b[1usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self> { // per-lane choice between b and c, controlled by mask a
+        [
+            if a.val.0[0usize] != 0 { // any non-zero mask lane counts as "true"
+                b[0usize]
+            } else {
+                c[0usize]
+            },
+            if a.val.0[1usize] != 0 {
+                b[1usize]
+            } else {
+                c[1usize]
+            },
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::min(a[0usize], b[0usize]),
+            i64::min(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        [
+            i64::max(a[0usize], b[0usize]),
+            i64::max(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self> { // concatenates a and b into one 4-lane vector
+        let mut result = [0; 4usize]; // zero-filled scratch array, fully overwritten below
+        result[0..2usize].copy_from_slice(&a.val.0); // lanes 0..2 come from a
+        result[2usize..4usize].copy_from_slice(&b.val.0); // lanes 2..4 come from b
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self> { // lane-wise negation; wraps on i64::MIN
+        [i64::wrapping_neg(a[0usize]), i64::wrapping_neg(a[1usize])].simd_into(self)
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        a.bitcast()
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self> {
+        a.bitcast()
+    }
+    #[inline(always)]
+    fn splat_u64x2(self, val: u64) -> u64x2<Self> {
+        [val; 2usize].simd_into(self)
+    }
+    #[inline(always)]
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
+            val: crate::support::Aligned128(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
+            val: crate::support::Aligned128(*val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize] {
+        a.val.0
+    }
+    #[inline(always)]
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize] {
+        &a.val.0
+    }
+    #[inline(always)]
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize] {
+        &mut a.val.0
+    }
+    #[inline(always)]
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> () {
+        *dest = a.val.0;
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self> {
+        u64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let mut dest = [Default::default(); 2usize];
+        dest[..2usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
+        dest[2usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
+        dest.simd_into(self)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
+        self,
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self> {
+        self.slide_u64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::wrapping_add(a[0usize], b[0usize]),
+            u64::wrapping_add(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::wrapping_sub(a[0usize], b[0usize]),
+            u64::wrapping_sub(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::wrapping_mul(a[0usize], b[0usize]),
+            u64::wrapping_mul(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::bitand(a[0usize], &b[0usize]),
+            u64::bitand(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::bitor(a[0usize], &b[0usize]),
+            u64::bitor(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::bitxor(a[0usize], &b[0usize]),
+            u64::bitxor(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self> {
+        [u64::not(a[0usize]), u64::not(a[1usize])].simd_into(self)
+    }
+    #[inline(always)]
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        [u64::shl(a[0usize], shift), u64::shl(a[1usize], shift)].simd_into(self)
+    }
+    #[inline(always)]
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::shl(a[0usize], &b[0usize]),
+            u64::shl(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        [u64::shr(a[0usize], shift), u64::shr(a[1usize], shift)].simd_into(self)
+    }
+    #[inline(always)]
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::shr(a[0usize], &b[0usize]),
+            u64::shr(a[1usize], &b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        [
+            -(u64::eq(&a[0usize], &b[0usize]) as i64),
+            -(u64::eq(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> { // lane-wise unsigned a < b, returned as a mask
+        [
+            -(u64::lt(&a[0usize], &b[0usize]) as i64), // compared as u64; the mask lane is an i64: true -> -1, false -> 0
+            -(u64::lt(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        [
+            -(u64::le(&a[0usize], &b[0usize]) as i64),
+            -(u64::le(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        [
+            -(u64::ge(&a[0usize], &b[0usize]) as i64),
+            -(u64::ge(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        [
+            -(u64::gt(&a[0usize], &b[0usize]) as i64),
+            -(u64::gt(&a[1usize], &b[1usize]) as i64),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [a[0usize], b[0usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [a[1usize], b[1usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [a[0usize], b[0usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [a[1usize], b[1usize]].simd_into(self)
+    }
+    #[inline(always)]
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self> {
+        [
+            if a.val.0[0usize] != 0 {
+                b[0usize]
+            } else {
+                c[0usize]
+            },
+            if a.val.0[1usize] != 0 {
+                b[1usize]
+            } else {
+                c[1usize]
+            },
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::min(a[0usize], b[0usize]),
+            u64::min(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        [
+            u64::max(a[0usize], b[0usize]),
+            u64::max(a[1usize], b[1usize]),
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self> {
+        let mut result = [0; 4usize];
         result[0..2usize].copy_from_slice(&a.val.0);
         result[2usize..4usize].copy_from_slice(&b.val.0);
         result.simd_into(self)
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        a.bitcast()
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self> {
         a.bitcast()
     }
     #[inline(always)]
@@ -4263,8 +4803,10 @@ impl Simd for Fallback {
     }
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
-        let lanes: [i64; 2usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        let lanes: [i64; 2usize] = [ // lane i is all-ones when bit i of `bits` is set, else 0; bits 2 and above are ignored
+            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
+            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
+        ];
         lanes.simd_into(self)
     }
     #[inline(always)]
@@ -6981,11 +7523,534 @@ impl Simd for Fallback {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f32x4(
-            self.reinterpret_f32_f64x2(a0),
-            self.reinterpret_f32_f64x2(a1),
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f32x4(
+            self.reinterpret_f32_f64x2(a0),
+            self.reinterpret_f32_f64x2(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_i64x4(self, val: i64) -> i64x4<Self> {
+        let half = self.splat_i64x2(val);
+        self.combine_i64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
+            val: crate::support::Aligned256(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
+            val: crate::support::Aligned256(*val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize] {
+        a.val.0
+    }
+    #[inline(always)]
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize] {
+        &a.val.0
+    }
+    #[inline(always)]
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize] {
+        &mut a.val.0
+    }
+    #[inline(always)]
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> () {
+        *dest = a.val.0;
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let mut dest = [Default::default(); 4usize];
+        dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
+        dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
+        dest.simd_into(self)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x4<const SHIFT: usize>(
+        self,
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(
+            self.slide_within_blocks_i64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x2::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1))
+    }
+    #[inline(always)]
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, _) = self.split_i64x4(a);
+        let (b0, _) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (_, a1) = self.split_i64x4(a);
+        let (_, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_lo = self.zip_low_i64x2(a0, b0);
+        let lo_hi = self.zip_high_i64x2(a0, b0);
+        let hi_lo = self.zip_low_i64x2(a1, b1);
+        let hi_hi = self.zip_high_i64x2(a1, b1);
+        (
+            self.combine_i64x2(lo_lo, lo_hi),
+            self.combine_i64x2(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_even = self.unzip_low_i64x2(a0, a1);
+        let lo_odd = self.unzip_high_i64x2(a0, a1);
+        let hi_even = self.unzip_low_i64x2(b0, b1);
+        let hi_odd = self.unzip_high_i64x2(b0, b1);
+        (
+            self.combine_i64x2(lo_even, hi_even),
+            self.combine_i64x2(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let (c0, c1) = self.split_i64x4(c);
+        self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self> {
+        let mut result = [0; 8usize];
+        result[0..4usize].copy_from_slice(&a.val.0);
+        result[4usize..8usize].copy_from_slice(&b.val.0);
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        let mut b0 = [0; 2usize];
+        let mut b1 = [0; 2usize];
+        b0.copy_from_slice(&a.val.0[0..2usize]);
+        b1.copy_from_slice(&a.val.0[2usize..4usize]);
+        (b0.simd_into(self), b1.simd_into(self))
+    }
+    #[inline(always)]
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i64x2(a0),
+            self.reinterpret_u32_i64x2(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u64x4(self, val: u64) -> u64x4<Self> {
+        let half = self.splat_u64x2(val);
+        self.combine_u64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
+            val: crate::support::Aligned256(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
+            val: crate::support::Aligned256(*val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize] {
+        a.val.0
+    }
+    #[inline(always)]
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize] {
+        &a.val.0
+    }
+    #[inline(always)]
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize] {
+        &mut a.val.0
+    }
+    #[inline(always)]
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> () {
+        *dest = a.val.0;
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self> {
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let mut dest = [Default::default(); 4usize];
+        dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
+        dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
+        dest.simd_into(self)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u64x4<const SHIFT: usize>(
+        self,
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(
+            self.slide_within_blocks_u64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x2::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1))
+    }
+    #[inline(always)]
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, _) = self.split_u64x4(a);
+        let (b0, _) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (_, a1) = self.split_u64x4(a);
+        let (_, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_lo = self.zip_low_u64x2(a0, b0);
+        let lo_hi = self.zip_high_u64x2(a0, b0);
+        let hi_lo = self.zip_low_u64x2(a1, b1);
+        let hi_hi = self.zip_high_u64x2(a1, b1);
+        (
+            self.combine_u64x2(lo_lo, lo_hi),
+            self.combine_u64x2(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_even = self.unzip_low_u64x2(a0, a1);
+        let lo_odd = self.unzip_high_u64x2(a0, a1);
+        let hi_even = self.unzip_low_u64x2(b0, b1);
+        let hi_odd = self.unzip_high_u64x2(b0, b1);
+        (
+            self.combine_u64x2(lo_even, hi_even),
+            self.combine_u64x2(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let (c0, c1) = self.split_u64x4(c);
+        self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self> {
+        let mut result = [0; 8usize];
+        result[0..4usize].copy_from_slice(&a.val.0);
+        result[4usize..8usize].copy_from_slice(&b.val.0);
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        let mut b0 = [0; 2usize];
+        let mut b1 = [0; 2usize];
+        b0.copy_from_slice(&a.val.0[0..2usize]);
+        b1.copy_from_slice(&a.val.0[2usize..4usize]);
+        (b0.simd_into(self), b1.simd_into(self))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u64x2(a0),
+            self.reinterpret_u32_u64x2(a1),
         )
     }
     #[inline(always)]
@@ -9840,6 +10905,535 @@ impl Simd for Fallback {
         )
     }
     #[inline(always)]
+    fn splat_i64x8(self, val: i64) -> i64x8<Self> {
+        let half = self.splat_i64x4(val);
+        self.combine_i64x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
+            val: crate::support::Aligned512(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
+            val: crate::support::Aligned512(*val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize] {
+        a.val.0
+    }
+    #[inline(always)]
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize] {
+        &a.val.0
+    }
+    #[inline(always)]
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize] {
+        &mut a.val.0
+    }
+    #[inline(always)]
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> () {
+        *dest = a.val.0;
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let mut dest = [Default::default(); 8usize];
+        dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
+        dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
+        dest.simd_into(self)
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
+        self,
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(
+            self.slide_within_blocks_i64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x4::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1))
+    }
+    #[inline(always)]
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, _) = self.split_i64x8(a);
+        let (b0, _) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (_, a1) = self.split_i64x8(a);
+        let (_, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_lo = self.zip_low_i64x4(a0, b0);
+        let lo_hi = self.zip_high_i64x4(a0, b0);
+        let hi_lo = self.zip_low_i64x4(a1, b1);
+        let hi_hi = self.zip_high_i64x4(a1, b1);
+        (
+            self.combine_i64x4(lo_lo, lo_hi),
+            self.combine_i64x4(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_even = self.unzip_low_i64x4(a0, a1);
+        let lo_odd = self.unzip_high_i64x4(a0, a1);
+        let hi_even = self.unzip_low_i64x4(b0, b1);
+        let hi_odd = self.unzip_high_i64x4(b0, b1);
+        (
+            self.combine_i64x4(lo_even, hi_even),
+            self.combine_i64x4(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let (c0, c1) = self.split_i64x8(c);
+        self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1))
+    }
+    #[inline(always)] // Applies `max_i64x4` to the low halves and to the high halves.
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a_lo, a_hi) = self.split_i64x8(a);
+        let (b_lo, b_hi) = self.split_i64x8(b);
+        self.combine_i64x4(self.max_i64x4(a_lo, b_lo), self.max_i64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Copies lanes 0..4 and 4..8 of `a` into two separate 4-lane vectors.
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let mut lo = [0; 4usize];
+        let mut hi = [0; 4usize];
+        lo.copy_from_slice(&a.val.0[..4usize]);
+        hi.copy_from_slice(&a.val.0[4usize..8usize]);
+        (lo.simd_into(self), hi.simd_into(self))
+    }
+    #[inline(always)] // Applies `neg_i64x4` to each half of `a` and rejoins the halves.
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (lo, hi) = self.split_i64x8(a);
+        self.combine_i64x4(self.neg_i64x4(lo), self.neg_i64x4(hi))
+    }
+    #[inline(always)] // Reinterprets each half with `reinterpret_u8_i64x4` and rejoins them.
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        let (lo, hi) = self.split_i64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_i64x4(lo), self.reinterpret_u8_i64x4(hi))
+    }
+    #[inline(always)] // Reinterprets each half with `reinterpret_u32_i64x4` and rejoins them.
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self> {
+        let (lo, hi) = self.split_i64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i64x4(lo),
+            self.reinterpret_u32_i64x4(hi),
+        )
+    }
+    #[inline(always)] // Builds one `splat_u64x4(val)` and uses it for both halves.
+    fn splat_u64x8(self, val: u64) -> u64x8<Self> {
+        let lanes = self.splat_u64x4(val);
+        self.combine_u64x4(lanes, lanes)
+    }
+    #[inline(always)] // Wraps the array, taken by value, in `Aligned512` as the vector storage.
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
+            simd: self,
+            val: crate::support::Aligned512(val),
+        }
+    }
+    #[inline(always)] // Copies the referenced array into `Aligned512` as the vector storage.
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
+            simd: self,
+            val: crate::support::Aligned512(*val),
+        }
+    }
+    #[inline(always)] // Returns the eight lanes of `a` as a plain array, by value.
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize] {
+        a.val.0
+    }
+    #[inline(always)] // Borrows the lane array stored inside `a`.
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize] {
+        &a.val.0
+    }
+    #[inline(always)] // Mutably borrows the lane array stored inside `a`.
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize] {
+        &mut a.val.0
+    }
+    #[inline(always)] // Overwrites `dest` with all eight lanes of `a`.
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
+        dest.copy_from_slice(&a.val.0);
+    }
+    #[inline(always)] // Rebuilds the storage of `a` as `u64x8` through `checked_transmute_copy`.
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self> {
+        u64x8 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // Rebuilds the storage of `a` as `u8x64` through `checked_transmute_copy`.
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
+        u8x64 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // Result: lanes `SHIFT..` of `a`, then the first `SHIFT` lanes of `b`.
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let mut lanes = [0; 8usize];
+        lanes[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]);
+        lanes[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]);
+        lanes.simd_into(self)
+    }
+    #[inline(always)] // Runs `slide_within_blocks_u64x4::<SHIFT>` on each pair of halves.
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
+        self,
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(
+            self.slide_within_blocks_u64x4::<SHIFT>(a_lo, b_lo),
+            self.slide_within_blocks_u64x4::<SHIFT>(a_hi, b_hi),
+        )
+    }
+    #[inline(always)] // Applies `add_u64x4` to the low halves and to the high halves.
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.add_u64x4(a_lo, b_lo), self.add_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `sub_u64x4` to the low halves and to the high halves.
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.sub_u64x4(a_lo, b_lo), self.sub_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `mul_u64x4` to the low halves and to the high halves.
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.mul_u64x4(a_lo, b_lo), self.mul_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `and_u64x4` to the low halves and to the high halves.
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.and_u64x4(a_lo, b_lo), self.and_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `or_u64x4` to the low halves and to the high halves.
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.or_u64x4(a_lo, b_lo), self.or_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `xor_u64x4` to the low halves and to the high halves.
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.xor_u64x4(a_lo, b_lo), self.xor_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `not_u64x4` to each half of `a` and rejoins the halves.
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self> {
+        let (lo, hi) = self.split_u64x8(a);
+        self.combine_u64x4(self.not_u64x4(lo), self.not_u64x4(hi))
+    }
+    #[inline(always)] // Applies `shl_u64x4` with the same scalar `shift` to each half.
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (lo, hi) = self.split_u64x8(a);
+        self.combine_u64x4(self.shl_u64x4(lo, shift), self.shl_u64x4(hi, shift))
+    }
+    #[inline(always)] // Applies `shlv_u64x4` to the low halves and to the high halves.
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.shlv_u64x4(a_lo, b_lo), self.shlv_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `shr_u64x4` with the same scalar `shift` to each half.
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (lo, hi) = self.split_u64x8(a);
+        self.combine_u64x4(self.shr_u64x4(lo, shift), self.shr_u64x4(hi, shift))
+    }
+    #[inline(always)] // Applies `shrv_u64x4` to the low halves and to the high halves.
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.shrv_u64x4(a_lo, b_lo), self.shrv_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Compares halves with `simd_eq_u64x4` and joins the two 4-lane masks.
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_eq_u64x4(a_lo, b_lo), self.simd_eq_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Compares halves with `simd_lt_u64x4` and joins the two 4-lane masks.
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_lt_u64x4(a_lo, b_lo), self.simd_lt_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Compares halves with `simd_le_u64x4` and joins the two 4-lane masks.
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_le_u64x4(a_lo, b_lo), self.simd_le_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Compares halves with `simd_ge_u64x4` and joins the two 4-lane masks.
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_ge_u64x4(a_lo, b_lo), self.simd_ge_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Compares halves with `simd_gt_u64x4` and joins the two 4-lane masks.
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_gt_u64x4(a_lo, b_lo), self.simd_gt_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Uses only the low halves: their `zip_low` then `zip_high`, concatenated.
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, _) = self.split_u64x8(a);
+        let (b_lo, _) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a_lo, b_lo), self.zip_high_u64x4(a_lo, b_lo))
+    }
+    #[inline(always)] // Uses only the high halves: their `zip_low` then `zip_high`, concatenated.
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (_, a_hi) = self.split_u64x8(a);
+        let (_, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a_hi, b_hi), self.zip_high_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // `unzip_low_u64x4` over the halves of `a`, then over the halves of `b`.
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_low_u64x4(a_lo, a_hi), self.unzip_low_u64x4(b_lo, b_hi))
+    }
+    #[inline(always)] // `unzip_high_u64x4` over the halves of `a`, then over the halves of `b`.
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_high_u64x4(a_lo, a_hi), self.unzip_high_u64x4(b_lo, b_hi))
+    }
+    #[inline(always)] // Zips `a` with `b`: 1st result uses the low halves, 2nd the high halves.
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        let low_front = self.zip_low_u64x4(a_lo, b_lo);
+        let low_back = self.zip_high_u64x4(a_lo, b_lo);
+        let high_front = self.zip_low_u64x4(a_hi, b_hi);
+        let high_back = self.zip_high_u64x4(a_hi, b_hi);
+        (
+            self.combine_u64x4(low_front, low_back),
+            self.combine_u64x4(high_front, high_back),
+        )
+    }
+    #[inline(always)] // Returns (`unzip_low` of `a` then `b`, `unzip_high` of `a` then `b`).
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        let a_first = self.unzip_low_u64x4(a_lo, a_hi);
+        let a_second = self.unzip_high_u64x4(a_lo, a_hi);
+        let b_first = self.unzip_low_u64x4(b_lo, b_hi);
+        let b_second = self.unzip_high_u64x4(b_lo, b_hi);
+        (
+            self.combine_u64x4(a_first, b_first),
+            self.combine_u64x4(a_second, b_second),
+        )
+    }
+    #[inline(always)] // Splits mask `a` and operands `b`, `c`; runs `select_u64x4` on each half.
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self> {
+        let (m_lo, m_hi) = self.split_mask64x8(a);
+        let ((b_lo, b_hi), (c_lo, c_hi)) = (self.split_u64x8(b), self.split_u64x8(c));
+        let lo = self.select_u64x4(m_lo, b_lo, c_lo);
+        self.combine_u64x4(lo, self.select_u64x4(m_hi, b_hi, c_hi))
+    }
+    #[inline(always)] // Applies `min_u64x4` to the low halves and to the high halves.
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.min_u64x4(a_lo, b_lo), self.min_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Applies `max_u64x4` to the low halves and to the high halves.
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a_lo, a_hi) = self.split_u64x8(a);
+        let (b_lo, b_hi) = self.split_u64x8(b);
+        self.combine_u64x4(self.max_u64x4(a_lo, b_lo), self.max_u64x4(a_hi, b_hi))
+    }
+    #[inline(always)] // Copies lanes 0..4 and 4..8 of `a` into two separate 4-lane vectors.
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let mut lo = [0; 4usize];
+        let mut hi = [0; 4usize];
+        lo.copy_from_slice(&a.val.0[..4usize]);
+        hi.copy_from_slice(&a.val.0[4usize..8usize]);
+        (lo.simd_into(self), hi.simd_into(self))
+    }
+    #[inline(always)] // 4-way de-interleave: 128-bit block `k` of the result is `src[k]`, `src[k + 4]`.
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> {
+        [
+            src[0usize], // block 0
+            src[4usize],
+            src[1usize], // block 1
+            src[5usize],
+            src[2usize], // block 2
+            src[6usize],
+            src[3usize], // block 3
+            src[7usize],
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)] // Inverse of the load: block `k` of `a` goes to `dest[k]` and `dest[k + 4]`.
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
+        *dest = [
+            a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], a[7usize],
+        ];
+    }
+    #[inline(always)] // Reinterprets each half with `reinterpret_u8_u64x4` and rejoins them.
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
+        let (lo, hi) = self.split_u64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_u64x4(lo), self.reinterpret_u8_u64x4(hi))
+    }
+    #[inline(always)] // Reinterprets each half with `reinterpret_u32_u64x4` and rejoins them.
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self> {
+        let (lo, hi) = self.split_u64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u64x4(lo),
+            self.reinterpret_u32_u64x4(hi),
+        )
+    }
+    #[inline(always)]
     fn splat_mask64x8(self, val: bool) -> mask64x8<Self> {
         let half = self.splat_mask64x4(val);
         self.combine_mask64x4(half, half)
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index 8553ff661..656848614 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -6,9 +6,9 @@
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 use core::arch::aarch64::*;
 #[doc = "A token for Neon intrinsics on aarch64, representing the \"neon\" level."]
@@ -35,6 +35,8 @@ impl ArchTypes for Neon {
     type u32x4 = crate::support::Aligned128<uint32x4_t>;
     type mask32x4 = crate::support::Aligned128<int32x4_t>;
     type f64x2 = crate::support::Aligned128<float64x2_t>;
+    type i64x2 = crate::support::Aligned128<int64x2_t>;
+    type u64x2 = crate::support::Aligned128<uint64x2_t>;
     type mask64x2 = crate::support::Aligned128<int64x2_t>;
     type f32x8 = crate::support::Aligned256<float32x4x2_t>;
     type i8x32 = crate::support::Aligned256<int8x16x2_t>;
@@ -47,6 +49,8 @@ impl ArchTypes for Neon {
     type u32x8 = crate::support::Aligned256<uint32x4x2_t>;
     type mask32x8 = crate::support::Aligned256<int32x4x2_t>;
     type f64x4 = crate::support::Aligned256<float64x2x2_t>;
+    type i64x4 = crate::support::Aligned256<int64x2x2_t>;
+    type u64x4 = crate::support::Aligned256<uint64x2x2_t>;
     type mask64x4 = crate::support::Aligned256<int64x2x2_t>;
     type f32x16 = crate::support::Aligned512<float32x4x4_t>;
     type i8x64 = crate::support::Aligned512<int8x16x4_t>;
@@ -59,6 +63,8 @@ impl ArchTypes for Neon {
     type u32x16 = crate::support::Aligned512<uint32x4x4_t>;
     type mask32x16 = crate::support::Aligned512<int32x4x4_t>;
     type f64x8 = crate::support::Aligned512<float64x2x4_t>;
+    type i64x8 = crate::support::Aligned512<int64x2x4_t>;
+    type u64x8 = crate::support::Aligned512<uint64x2x4_t>;
     type mask64x8 = crate::support::Aligned512<int64x2x4_t>;
 }
 impl Simd for Neon {
@@ -70,6 +76,8 @@ impl Simd for Neon {
     type i16s = i16x8<Self>;
     type u32s = u32x4<Self>;
     type i32s = i32x4<Self>;
+    type u64s = u64x2<Self>;
+    type i64s = i64x2<Self>;
     type mask8s = mask8x16<Self>;
     type mask16s = mask16x8<Self>;
     type mask32s = mask32x4<Self>;
@@ -3705,434 +3713,1180 @@ impl Simd for Neon {
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+    fn splat_i64x2(self, val: i64) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, val: bool) -> mask64x2<Neon> {
-                let val: i64 = if val { !0 } else { 0 };
+            fn kernel(token: Neon, val: i64) -> i64x2<Neon> {
                 vdupq_n_s64(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
-        mask64x2 {
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self> { // copies `*val` into storage
+        i64x2 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(val),
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize] {
         crate::transmute::checked_transmute_copy::<int64x2_t, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize] { // borrows the `int64x2_t` as an array
+        crate::transmute::checked_cast_ref::<int64x2_t, [i64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)] // Mutably borrows the `int64x2_t` inside `a` as a two-lane array.
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize] {
+        crate::transmute::checked_cast_mut::<int64x2_t, [i64; 2usize]>(&mut a.val.0)
+    }
+    #[inline(always)] // Writes the register held by `a` into `dest` via `checked_transmute_store`.
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)] // Rebuilds the storage of `a` as `i64x2` through `checked_transmute_copy`.
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self> {
+        i64x2 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // Rebuilds the storage of `a` as `u8x16` through `checked_transmute_copy`.
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // Slides over `a` then `b` by `SHIFT` lanes using the byte-wise `dyn_vext_128`.
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        if SHIFT >= 2usize { // a shift of the full lane count (or more) yields `b` unchanged
+            return b;
+        }
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_i64x2(a).val.0,
+            self.cvt_to_bytes_i64x2(b).val.0,
+            SHIFT * 8usize, // lane count scaled to bytes (8 bytes per i64 lane)
+        );
+        self.cvt_from_bytes_i64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
+    }
+    #[inline(always)] // Forwards unchanged to `slide_i64x2::<SHIFT>`.
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self> {
+        self.slide_i64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, bits: u64) -> mask64x2<Neon> {
-                let shifts =
-                    crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(&[63, 62]);
-                let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
-                let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
-                vreinterpretq_s64_u64(mask).simd_into(token)
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                vaddq_s64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, bits)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>) -> u64 {
-                let weights =
-                    crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(&[1, 2]);
-                let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
-                vaddvq_u64(bits)
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                vsubq_s64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 2usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2usize
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let result: [i64; 2usize] = [
+                    a[0usize].wrapping_mul(b[0usize]),
+                    a[1usize].wrapping_mul(b[1usize]),
+                ];
+                result.simd_into(token)
+            }
         );
-        let mut lanes = self.as_array_mask64x2(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x2(lanes);
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
                 vandq_s64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
                 vorrq_s64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
                 veorq_s64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>) -> mask64x2<Neon> {
+            fn kernel(token: Neon, a: i64x2<Neon>) -> i64x2<Neon> {
                 vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn select_mask64x2(
-        self,
-        a: mask64x2<Self>,
-        b: mask64x2<Self>,
-        c: mask64x2<Self>,
-    ) -> mask64x2<Self> {
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> { // `vshlq_s64` by a broadcast count
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, shift: u32) -> i64x2<Neon> {
+                vshlq_s64(a.into(), vdupq_n_s64(shift as i64)).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)] // Per-lane shift: `vshlq_s64` with the lanes of `b` as the counts.
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                vshlq_s64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Right shift expressed as `vshlq_s64` by the negated, broadcast `shift`.
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, shift: u32) -> i64x2<Neon> {
+                vshlq_s64(a.into(), vdupq_n_s64(-(shift as i64))).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
+    }
+    #[inline(always)] // Right shift expressed as `vshlq_s64` by the lane-wise negation of `b`.
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                vshlq_s64(a.into(), vnegq_s64(b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // `vceqq_s64` result, reinterpreted from u64 to s64 lanes, becomes the mask.
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // `vcltq_s64` result, reinterpreted from u64 to s64 lanes, becomes the mask.
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcltq_s64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // `vcleq_s64` result, reinterpreted from u64 to s64 lanes, becomes the mask.
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcleq_s64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // `vcgeq_s64` result, reinterpreted from u64 to s64 lanes, becomes the mask.
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcgeq_s64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // `vcgtq_s64` result, reinterpreted from u64 to s64 lanes, becomes the mask.
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcgtq_s64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Thin wrapper over `vzip1q_s64(a, b)`.
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vzip1q_s64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Thin wrapper over `vzip2q_s64(a, b)`.
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vzip2q_s64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Thin wrapper over `vuzp1q_s64(a, b)`.
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vuzp1q_s64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Thin wrapper over `vuzp2q_s64(a, b)`.
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vuzp2q_s64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)] // Returns the pair (`zip_low_i64x2(a, b)`, `zip_high_i64x2(a, b)`).
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b))
+    }
+    #[inline(always)] // Returns the pair (`unzip_low_i64x2(a, b)`, `unzip_high_i64x2(a, b)`).
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
             fn kernel(
                 token: Neon,
                 a: mask64x2<Neon>,
-                b: mask64x2<Neon>,
-                c: mask64x2<Neon>,
-            ) -> mask64x2<Neon> {
+                b: i64x2<Neon>,
+                c: i64x2<Neon>,
+            ) -> i64x2<Neon> {
                 vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token)
             }
         );
         kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
-                vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token)
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+                result.simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
-                vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0
+            fn kernel(token: Neon, a: i64x2<Neon>, b: i64x2<Neon>) -> i64x2<Neon> {
+                let a: [i64; 2usize] = a.into();
+                let b: [i64; 2usize] = b.into();
+                let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+                result.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self> { // `a` first, then `b`
+        i64x4 {
+            simd: self,
+            val: crate::support::Aligned256(int64x2x2_t(a.val.0, b.val.0)),
+        }
+    }
+    #[inline(always)]
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
-                vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff
+            fn kernel(token: Neon, a: i64x2<Neon>) -> i64x2<Neon> {
+                vnegq_s64(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
-                vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff
+            fn kernel(token: Neon, a: i64x2<Neon>) -> u8x16<Neon> {
+                vreinterpretq_u8_s64(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
-                vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0
+            fn kernel(token: Neon, a: i64x2<Neon>) -> u32x4<Neon> {
+                vreinterpretq_u32_s64(a.into()).simd_into(token)
             }
         );
         kernel(self, a)
     }
     #[inline(always)]
-    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: crate::support::Aligned256(int64x2x2_t(a.val.0, b.val.0)),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
-        let half = self.splat_f32x4(val);
-        self.combine_f32x4(half, half)
+    fn splat_u64x2(self, val: u64) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, val: u64) -> u64x2<Neon> {
+                vdupq_n_u64(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
-        crate::transmute::checked_transmute_copy::<float32x4x2_t, [f32; 8usize]>(&a.val.0)
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize] {
+        crate::transmute::checked_transmute_copy::<uint64x2_t, [u64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
-        crate::transmute::checked_cast_ref::<float32x4x2_t, [f32; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize] {
+        crate::transmute::checked_cast_ref::<uint64x2_t, [u64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
-        crate::transmute::checked_cast_mut::<float32x4x2_t, [f32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize] {
+        crate::transmute::checked_cast_mut::<uint64x2_t, [u64; 2usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
-        f32x8 {
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        u8x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        if SHIFT >= 8usize {
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        if SHIFT >= 2usize {
             return b;
         }
-        let result = {
-            let a_bytes = self.cvt_to_bytes_f32x8(a).val.0;
-            let b_bytes = self.cvt_to_bytes_f32x8(b).val.0;
-            let a_blocks = [a_bytes.0, a_bytes.1];
-            let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT * 4usize;
-            uint8x16x2_t(
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        0,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        1,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-            )
-        };
-        self.cvt_from_bytes_f32x8(u8x32 {
-            val: crate::support::Aligned256(result),
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_u64x2(a).val.0,
+            self.cvt_to_bytes_u64x2(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_u64x2(u8x16 {
+            val: crate::support::Aligned128(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
         self,
-        a: f32x8<Self>,
-        b: f32x8<Self>,
-    ) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self> {
+        self.slide_u64x2::<SHIFT>(a, b)
     }
     #[inline(always)]
-    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                vaddq_u64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                vsubq_u64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(
-            self.approximate_recip_f32x4(a0),
-            self.approximate_recip_f32x4(a1),
-        )
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let result: [u64; 2usize] = [
+                    a[0usize].wrapping_mul(b[0usize]),
+                    a[1usize].wrapping_mul(b[1usize]),
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                vandq_u64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                vorrq_u64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                veorq_u64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>) -> u64x2<Neon> {
+                vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a.into()))).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, shift: u32) -> u64x2<Neon> {
+                vshlq_u64(a.into(), vdupq_n_s64(shift as i64)).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                vshlq_u64(a.into(), vreinterpretq_s64_u64(b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, shift: u32) -> u64x2<Neon> {
+                vshlq_u64(a.into(), vdupq_n_s64(-(shift as i64))).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                vshlq_u64(a.into(), vnegq_s64(vreinterpretq_s64_u64(b.into()))).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vceqq_u64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcltq_u64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, _) = self.split_f32x8(a);
-        let (b0, _) = self.split_f32x8(b);
-        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcleq_u64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (_, a1) = self.split_f32x8(a);
-        let (_, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcgeq_u64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vcgtq_u64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vzip1q_u64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let lo_lo = self.zip_low_f32x4(a0, b0);
-        let lo_hi = self.zip_high_f32x4(a0, b0);
-        let hi_lo = self.zip_low_f32x4(a1, b1);
-        let hi_hi = self.zip_high_f32x4(a1, b1);
-        (
-            self.combine_f32x4(lo_lo, lo_hi),
-            self.combine_f32x4(hi_lo, hi_hi),
-        )
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vzip2q_u64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let lo_even = self.unzip_low_f32x4(a0, a1);
-        let lo_odd = self.unzip_high_f32x4(a0, a1);
-        let hi_even = self.unzip_low_f32x4(b0, b1);
-        let hi_odd = self.unzip_high_f32x4(b0, b1);
-        (
-            self.combine_f32x4(lo_even, hi_even),
-            self.combine_f32x4(lo_odd, hi_odd),
-        )
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vuzp1q_u64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let x = a.into();
+                let y = b.into();
+                vuzp2q_u64(x, y).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b))
     }
     #[inline(always)]
-    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.max_precise_f32x4(a0, b0),
-            self.max_precise_f32x4(a1, b1),
-        )
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Neon,
+                a: mask64x2<Neon>,
+                b: u64x2<Neon>,
+                c: u64x2<Neon>,
+            ) -> u64x2<Neon> {
+                vbslq_u64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>, b: u64x2<Neon>) -> u64x2<Neon> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self> {
+        u64x4 {
+            val: crate::support::Aligned256(uint64x2x2_t(a.val.0, b.val.0)),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>) -> u8x16<Neon> {
+                vreinterpretq_u8_u64(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u64x2<Neon>) -> u32x4<Neon> {
+                vreinterpretq_u32_u64(a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, val: bool) -> mask64x2<Neon> {
+                let val: i64 = if val { !0 } else { 0 };
+                vdupq_n_s64(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        mask64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        crate::transmute::checked_transmute_copy::<int64x2_t, [i64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, bits: u64) -> mask64x2<Neon> {
+                let shifts =
+                    crate::transmute::checked_transmute_copy::<[i64; 2], int64x2_t>(&[63, 62]);
+                let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
+                let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
+                vreinterpretq_s64_u64(mask).simd_into(token)
+            }
+        );
+        kernel(self, bits)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>) -> u64 {
+                let weights =
+                    crate::transmute::checked_transmute_copy::<[u64; 2], uint64x2_t>(&[1, 2]);
+                let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
+                vaddvq_u64(bits)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+                vandq_s64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+                vorrq_s64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+                veorq_s64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Neon,
+                a: mask64x2<Neon>,
+                b: mask64x2<Neon>,
+                c: mask64x2<Neon>,
+            ) -> mask64x2<Neon> {
+                vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>, b: mask64x2<Neon>) -> mask64x2<Neon> {
+                vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
+                vmaxvq_u32(vreinterpretq_u32_s64(a.into())) != 0
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
+                vminvq_u32(vreinterpretq_u32_s64(a.into())) == 0xffffffff
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
+                vminvq_u32(vreinterpretq_u32_s64(a.into())) != 0xffffffff
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: mask64x2<Neon>) -> bool {
+                vmaxvq_u32(vreinterpretq_u32_s64(a.into())) == 0
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::support::Aligned256(int64x2x2_t(a.val.0, b.val.0)),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+        let half = self.splat_f32x4(val);
+        self.combine_f32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        crate::transmute::checked_transmute_copy::<float32x4x2_t, [f32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        crate::transmute::checked_cast_ref::<float32x4x2_t, [f32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        crate::transmute::checked_cast_mut::<float32x4x2_t, [f32; 8usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)] // SHIFT >= 8 yields `b`; else each uint8x16 block comes from dyn_vext_128.
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let slid = {
+            let a_raw = self.cvt_to_bytes_f32x8(a).val.0;
+            let b_raw = self.cvt_to_bytes_f32x8(b).val.0;
+            let a_regs = [a_raw.0, a_raw.1];
+            let b_regs = [b_raw.0, b_raw.1];
+            let byte_shift = SHIFT * 4usize;
+            uint8x16x2_t(
+                {
+                    let [left, right] = crate::support::cross_block_slide_blocks_at(
+                        &a_regs,
+                        &b_regs,
+                        0,
+                        byte_shift,
+                    );
+                    dyn_vext_128(self, left, right, byte_shift % 16)
+                },
+                {
+                    let [left, right] = crate::support::cross_block_slide_blocks_at(
+                        &a_regs,
+                        &b_regs,
+                        1,
+                        byte_shift,
+                    );
+                    dyn_vext_128(self, left, right, byte_shift % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_f32x8(u8x32 {
+            val: crate::support::Aligned256(slid),
+            simd: self,
+        })
+    }
+    #[inline(always)] // Applies slide_within_blocks_f32x4::<SHIFT> to each pair of halves.
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+        self,
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.slide_within_blocks_f32x4::<SHIFT>(al, bl),
+            self.slide_within_blocks_f32x4::<SHIFT>(ah, bh),
+        )
+    }
+    #[inline(always)] // Splits `a`, applies abs_f32x4 to each half, recombines.
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.abs_f32x4(lo), self.abs_f32x4(hi))
+    }
+    #[inline(always)] // Splits `a`, applies neg_f32x4 to each half, recombines.
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.neg_f32x4(lo), self.neg_f32x4(hi))
+    }
+    #[inline(always)] // Splits `a`, applies sqrt_f32x4 to each half, recombines.
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.sqrt_f32x4(lo), self.sqrt_f32x4(hi))
+    }
+    #[inline(always)] // Splits `a`, applies approximate_recip_f32x4 to each half, recombines.
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(
+            self.approximate_recip_f32x4(lo),
+            self.approximate_recip_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Splits `a` and `b`, applies add_f32x4 to each pair of halves.
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.add_f32x4(al, bl), self.add_f32x4(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies sub_f32x4 to each pair of halves.
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.sub_f32x4(al, bl), self.sub_f32x4(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies mul_f32x4 to each pair of halves.
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.mul_f32x4(al, bl), self.mul_f32x4(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies div_f32x4 to each pair of halves.
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.div_f32x4(al, bl), self.div_f32x4(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies copysign_f32x4 to each pair of halves.
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.copysign_f32x4(al, bl), self.copysign_f32x4(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_eq_f32x4 and joins the two masks.
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_eq_f32x4(al, bl), self.simd_eq_f32x4(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_lt_f32x4 and joins the two masks.
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_lt_f32x4(al, bl), self.simd_lt_f32x4(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_le_f32x4 and joins the two masks.
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_le_f32x4(al, bl), self.simd_le_f32x4(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_ge_f32x4 and joins the two masks.
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_ge_f32x4(al, bl), self.simd_ge_f32x4(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_gt_f32x4 and joins the two masks.
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_gt_f32x4(al, bl), self.simd_gt_f32x4(ah, bh))
+    }
+    #[inline(always)] // Zips only the first halves of `a` and `b`; the second halves are unused.
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, _) = self.split_f32x8(a);
+        let (bl, _) = self.split_f32x8(b);
+        self.combine_f32x4(self.zip_low_f32x4(al, bl), self.zip_high_f32x4(al, bl))
+    }
+    #[inline(always)] // Zips only the second halves of `a` and `b`; the first halves are unused.
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (_, ah) = self.split_f32x8(a);
+        let (_, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.zip_low_f32x4(ah, bh), self.zip_high_f32x4(ah, bh))
+    }
+    #[inline(always)] // unzip_low_f32x4 over the halves of `a`, then over the halves of `b`.
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.unzip_low_f32x4(al, ah), self.unzip_low_f32x4(bl, bh))
+    }
+    #[inline(always)] // unzip_high_f32x4 over the halves of `a`, then over the halves of `b`.
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.unzip_high_f32x4(al, ah), self.unzip_high_f32x4(bl, bh))
+    }
+    #[inline(always)] // First output zips the first halves; second output zips the second halves.
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        let q0 = self.zip_low_f32x4(al, bl);
+        let q1 = self.zip_high_f32x4(al, bl);
+        let q2 = self.zip_low_f32x4(ah, bh);
+        let q3 = self.zip_high_f32x4(ah, bh);
+        (
+            self.combine_f32x4(q0, q1),
+            self.combine_f32x4(q2, q3),
+        )
+    }
+    #[inline(always)] // First output: unzip_low of `a` and of `b`; second: unzip_high of each.
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        let a_even = self.unzip_low_f32x4(al, ah);
+        let a_odd = self.unzip_high_f32x4(al, ah);
+        let b_even = self.unzip_low_f32x4(bl, bh);
+        let b_odd = self.unzip_high_f32x4(bl, bh);
+        (
+            self.combine_f32x4(a_even, b_even),
+            self.combine_f32x4(a_odd, b_odd),
+        )
+    }
+    #[inline(always)] // Splits `a` and `b`, applies max_f32x4 to each pair of halves.
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.max_f32x4(al, bl), self.max_f32x4(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies min_f32x4 to each pair of halves.
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(self.min_f32x4(al, bl), self.min_f32x4(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies max_precise_f32x4 to each pair of halves.
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.max_precise_f32x4(al, bl),
+            self.max_precise_f32x4(ah, bh),
+        )
     }
     #[inline(always)]
     fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
@@ -4144,197 +4898,1221 @@ impl Simd for Neon {
         )
     }
     #[inline(always)]
-    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(
-            self.mul_add_f32x4(a0, b0, c0),
-            self.mul_add_f32x4(a1, b1, c1),
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a); // every operand is handled as two halves
+        let (bl, bh) = self.split_f32x8(b);
+        let (cl, ch) = self.split_f32x8(c);
+        self.combine_f32x4(
+            self.mul_add_f32x4(al, bl, cl),
+            self.mul_add_f32x4(ah, bh, ch),
+        )
+    }
+    #[inline(always)] // Splits `a`, `b`, `c`; applies mul_sub_f32x4 per half; recombines.
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_f32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        let (cl, ch) = self.split_f32x8(c);
+        self.combine_f32x4(
+            self.mul_sub_f32x4(al, bl, cl),
+            self.mul_sub_f32x4(ah, bh, ch),
+        )
+    }
+    #[inline(always)] // Splits `a`, applies floor_f32x4 to each half, recombines.
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.floor_f32x4(lo), self.floor_f32x4(hi))
+    }
+    #[inline(always)] // Splits `a`, applies ceil_f32x4 to each half, recombines.
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.ceil_f32x4(lo), self.ceil_f32x4(hi))
+    }
+    #[inline(always)] // Splits `a`, applies round_ties_even_f32x4 to each half, recombines.
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(
+            self.round_ties_even_f32x4(lo),
+            self.round_ties_even_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Splits `a`, applies fract_f32x4 to each half, recombines.
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.fract_f32x4(lo), self.fract_f32x4(hi))
+    }
+    #[inline(always)] // Splits `a`, applies trunc_f32x4 to each half, recombines.
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f32x4(self.trunc_f32x4(lo), self.trunc_f32x4(hi))
+    }
+    #[inline(always)] // Splits the mask and both value vectors; select_f32x4 runs per half.
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (al, ah) = self.split_mask32x8(a);
+        let (bl, bh) = self.split_f32x8(b);
+        let (cl, ch) = self.split_f32x8(c);
+        self.combine_f32x4(self.select_f32x4(al, bl, cl), self.select_f32x4(ah, bh, ch))
+    }
+    #[inline(always)] // Packs both parts of `a`, then both parts of `b`, into one f32x16.
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+        f32x16 {
+            simd: self,
+            val: crate::support::Aligned512(float32x4x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+        }
+    }
+    #[inline(always)] // Wraps each of the two parts of `a` in its own Aligned128-backed f32x4.
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        (
+            f32x4 {
+                simd: self,
+                val: crate::support::Aligned128(a.val.0.0),
+            },
+            f32x4 {
+                simd: self,
+                val: crate::support::Aligned128(a.val.0.1),
+            },
+        )
+    }
+    #[inline(always)] // Applies reinterpret_f64_f32x4 to each half; joins with combine_f64x2.
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_f64x2(
+            self.reinterpret_f64_f32x4(lo),
+            self.reinterpret_f64_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Applies reinterpret_i32_f32x4 to each half; joins with combine_i32x4.
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_i32x4(
+            self.reinterpret_i32_f32x4(lo),
+            self.reinterpret_i32_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Applies reinterpret_u8_f32x4 to each half; joins with combine_u8x16.
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_f32x4(lo), self.reinterpret_u8_f32x4(hi))
+    }
+    #[inline(always)] // Applies reinterpret_u32_f32x4 to each half; joins with combine_u32x4.
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_f32x4(lo),
+            self.reinterpret_u32_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Applies cvt_u32_f32x4 to each half; joins with combine_u32x4.
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_u32x4(self.cvt_u32_f32x4(lo), self.cvt_u32_f32x4(hi))
+    }
+    #[inline(always)] // Applies cvt_u32_precise_f32x4 to each half; joins with combine_u32x4.
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_u32x4(
+            self.cvt_u32_precise_f32x4(lo),
+            self.cvt_u32_precise_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Applies cvt_i32_f32x4 to each half; joins with combine_i32x4.
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_i32x4(self.cvt_i32_f32x4(lo), self.cvt_i32_f32x4(hi))
+    }
+    #[inline(always)] // Applies cvt_i32_precise_f32x4 to each half; joins with combine_i32x4.
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (lo, hi) = self.split_f32x8(a);
+        self.combine_i32x4(
+            self.cvt_i32_precise_f32x4(lo),
+            self.cvt_i32_precise_f32x4(hi),
+        )
+    }
+    #[inline(always)] // Splats `val` into one i8x16 and uses that vector for both halves.
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+        let part = self.splat_i8x16(val);
+        self.combine_i8x16(part, part)
+    }
+    #[inline(always)] // Builds the vector from an owned array via checked_transmute_copy.
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
+        i8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&val),
+        }
+    }
+    #[inline(always)] // Builds the vector by copying the lanes out of a borrowed array.
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
+        i8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(val),
+        }
+    }
+    #[inline(always)] // Copies the int8x16x2_t storage out as an [i8; 32] array.
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<int8x16x2_t, [i8; 32usize]>(&a.val.0)
+    }
+    #[inline(always)] // Views the int8x16x2_t storage as &[i8; 32] via checked_cast_ref.
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
+        crate::transmute::checked_cast_ref::<int8x16x2_t, [i8; 32usize]>(&a.val.0)
+    }
+    #[inline(always)] // Views the int8x16x2_t storage as &mut [i8; 32] via checked_cast_mut.
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
+        crate::transmute::checked_cast_mut::<int8x16x2_t, [i8; 32usize]>(&mut a.val.0)
+    }
+    #[inline(always)] // Writes the vector's storage into `dest` via checked_transmute_store.
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)] // Rebuilds an i8x32 from the storage of a u8x32.
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        i8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // Rebuilds a u8x32 from the storage of an i8x32.
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // SHIFT >= 32 yields `b`; else each uint8x16 block comes from dyn_vext_128.
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let slid = {
+            let a_raw = self.cvt_to_bytes_i8x32(a).val.0;
+            let b_raw = self.cvt_to_bytes_i8x32(b).val.0;
+            let a_regs = [a_raw.0, a_raw.1];
+            let b_regs = [b_raw.0, b_raw.1];
+            let byte_shift = SHIFT;
+            uint8x16x2_t(
+                {
+                    let [left, right] = crate::support::cross_block_slide_blocks_at(
+                        &a_regs,
+                        &b_regs,
+                        0,
+                        byte_shift,
+                    );
+                    dyn_vext_128(self, left, right, byte_shift % 16)
+                },
+                {
+                    let [left, right] = crate::support::cross_block_slide_blocks_at(
+                        &a_regs,
+                        &b_regs,
+                        1,
+                        byte_shift,
+                    );
+                    dyn_vext_128(self, left, right, byte_shift % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_i8x32(u8x32 {
+            val: crate::support::Aligned256(slid),
+            simd: self,
+        })
+    }
+    #[inline(always)] // Applies slide_within_blocks_i8x16::<SHIFT> to each pair of halves.
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+        self,
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(
+            self.slide_within_blocks_i8x16::<SHIFT>(al, bl),
+            self.slide_within_blocks_i8x16::<SHIFT>(ah, bh),
+        )
+    }
+    #[inline(always)] // Splits `a` and `b`, applies add_i8x16 to each pair of halves.
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.add_i8x16(al, bl), self.add_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies sub_i8x16 to each pair of halves.
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.sub_i8x16(al, bl), self.sub_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies mul_i8x16 to each pair of halves.
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.mul_i8x16(al, bl), self.mul_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies and_i8x16 to each pair of halves.
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.and_i8x16(al, bl), self.and_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies or_i8x16 to each pair of halves.
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.or_i8x16(al, bl), self.or_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies xor_i8x16 to each pair of halves.
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.xor_i8x16(al, bl), self.xor_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a`, applies not_i8x16 to each half, recombines.
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let (lo, hi) = self.split_i8x32(a);
+        self.combine_i8x16(self.not_i8x16(lo), self.not_i8x16(hi))
+    }
+    #[inline(always)] // Applies shl_i8x16 with the same `shift` to each half of `a`.
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        let (lo, hi) = self.split_i8x32(a);
+        self.combine_i8x16(self.shl_i8x16(lo, shift), self.shl_i8x16(hi, shift))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies shlv_i8x16 to each pair of halves.
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.shlv_i8x16(al, bl), self.shlv_i8x16(ah, bh))
+    }
+    #[inline(always)] // Applies shr_i8x16 with the same `shift` to each half of `a`.
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        let (lo, hi) = self.split_i8x32(a);
+        self.combine_i8x16(self.shr_i8x16(lo, shift), self.shr_i8x16(hi, shift))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies shrv_i8x16 to each pair of halves.
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.shrv_i8x16(al, bl), self.shrv_i8x16(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_eq_i8x16 and joins the two masks.
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_eq_i8x16(al, bl), self.simd_eq_i8x16(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_lt_i8x16 and joins the two masks.
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_lt_i8x16(al, bl), self.simd_lt_i8x16(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_le_i8x16 and joins the two masks.
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_le_i8x16(al, bl), self.simd_le_i8x16(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_ge_i8x16 and joins the two masks.
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_ge_i8x16(al, bl), self.simd_ge_i8x16(ah, bh))
+    }
+    #[inline(always)] // Compares halves with simd_gt_i8x16 and joins the two masks.
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_gt_i8x16(al, bl), self.simd_gt_i8x16(ah, bh))
+    }
+    #[inline(always)] // Zips only the first halves of `a` and `b`; the second halves are unused.
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, _) = self.split_i8x32(a);
+        let (bl, _) = self.split_i8x32(b);
+        self.combine_i8x16(self.zip_low_i8x16(al, bl), self.zip_high_i8x16(al, bl))
+    }
+    #[inline(always)] // Zips only the second halves of `a` and `b`; the first halves are unused.
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (_, ah) = self.split_i8x32(a);
+        let (_, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.zip_low_i8x16(ah, bh), self.zip_high_i8x16(ah, bh))
+    }
+    #[inline(always)] // unzip_low_i8x16 over the halves of `a`, then over the halves of `b`.
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.unzip_low_i8x16(al, ah), self.unzip_low_i8x16(bl, bh))
+    }
+    #[inline(always)] // unzip_high_i8x16 over the halves of `a`, then over the halves of `b`.
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.unzip_high_i8x16(al, ah), self.unzip_high_i8x16(bl, bh))
+    }
+    #[inline(always)] // First output zips the first halves; second output zips the second halves.
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        let q0 = self.zip_low_i8x16(al, bl);
+        let q1 = self.zip_high_i8x16(al, bl);
+        let q2 = self.zip_low_i8x16(ah, bh);
+        let q3 = self.zip_high_i8x16(ah, bh);
+        (
+            self.combine_i8x16(q0, q1),
+            self.combine_i8x16(q2, q3),
+        )
+    }
+    #[inline(always)] // First output: unzip_low of `a` and of `b`; second: unzip_high of each.
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        let a_even = self.unzip_low_i8x16(al, ah);
+        let a_odd = self.unzip_high_i8x16(al, ah);
+        let b_even = self.unzip_low_i8x16(bl, bh);
+        let b_odd = self.unzip_high_i8x16(bl, bh);
+        (
+            self.combine_i8x16(a_even, b_even),
+            self.combine_i8x16(a_odd, b_odd),
+        )
+    }
+    #[inline(always)] // Splits the mask and both value vectors; select_i8x16 runs per half.
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_mask8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        let (cl, ch) = self.split_i8x32(c);
+        self.combine_i8x16(self.select_i8x16(al, bl, cl), self.select_i8x16(ah, bh, ch))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies min_i8x16 to each pair of halves.
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.min_i8x16(al, bl), self.min_i8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies max_i8x16 to each pair of halves.
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (al, ah) = self.split_i8x32(a);
+        let (bl, bh) = self.split_i8x32(b);
+        self.combine_i8x16(self.max_i8x16(al, bl), self.max_i8x16(ah, bh))
+    }
+    #[inline(always)] // Packs both parts of `a`, then both parts of `b`, into one i8x64.
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        i8x64 {
+            simd: self,
+            val: crate::support::Aligned512(int8x16x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+        }
+    }
+    #[inline(always)] // Wraps each of the two parts of `a` in its own Aligned128-backed i8x16.
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+        (
+            i8x16 {
+                simd: self,
+                val: crate::support::Aligned128(a.val.0.0),
+            },
+            i8x16 {
+                simd: self,
+                val: crate::support::Aligned128(a.val.0.1),
+            },
+        )
+    }
+    #[inline(always)] // Splits `a`, applies neg_i8x16 to each half, recombines.
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let (lo, hi) = self.split_i8x32(a);
+        self.combine_i8x16(self.neg_i8x16(lo), self.neg_i8x16(hi))
+    }
+    #[inline(always)] // Applies reinterpret_u8_i8x16 to each half; joins with combine_u8x16.
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let (lo, hi) = self.split_i8x32(a);
+        self.combine_u8x16(self.reinterpret_u8_i8x16(lo), self.reinterpret_u8_i8x16(hi))
+    }
+    #[inline(always)] // Applies reinterpret_u32_i8x16 to each half; joins with combine_u32x4.
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+        let (lo, hi) = self.split_i8x32(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i8x16(lo),
+            self.reinterpret_u32_i8x16(hi),
+        )
+    }
+    #[inline(always)] // Splats `val` into one u8x16 and uses that vector for both halves.
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        let part = self.splat_u8x16(val);
+        self.combine_u8x16(part, part)
+    }
+    #[inline(always)] // Builds the vector from an owned array via checked_transmute_copy.
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&val),
+        }
+    }
+    #[inline(always)] // Builds the vector by copying the lanes out of a borrowed array.
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(val),
+        }
+    }
+    #[inline(always)] // Copies the uint8x16x2_t storage out as a [u8; 32] array.
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        crate::transmute::checked_transmute_copy::<uint8x16x2_t, [u8; 32usize]>(&a.val.0)
+    }
+    #[inline(always)] // Views the uint8x16x2_t storage as &[u8; 32] via checked_cast_ref.
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        crate::transmute::checked_cast_ref::<uint8x16x2_t, [u8; 32usize]>(&a.val.0)
+    }
+    #[inline(always)] // Views the uint8x16x2_t storage as &mut [u8; 32] via checked_cast_mut.
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        crate::transmute::checked_cast_mut::<uint8x16x2_t, [u8; 32usize]>(&mut a.val.0)
+    }
+    #[inline(always)] // Writes the vector's storage into `dest` via checked_transmute_store.
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)] // Source and result are both u8x32; the storage is copied across.
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // Source and result are both u8x32; the storage is copied across.
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            simd: self,
+            val: crate::transmute::checked_transmute_copy(&a.val),
+        }
+    }
+    #[inline(always)] // SHIFT >= 32 yields `b`; else each uint8x16 block comes from dyn_vext_128.
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let slid = {
+            let a_raw = self.cvt_to_bytes_u8x32(a).val.0;
+            let b_raw = self.cvt_to_bytes_u8x32(b).val.0;
+            let a_regs = [a_raw.0, a_raw.1];
+            let b_regs = [b_raw.0, b_raw.1];
+            let byte_shift = SHIFT;
+            uint8x16x2_t(
+                {
+                    let [left, right] = crate::support::cross_block_slide_blocks_at(
+                        &a_regs,
+                        &b_regs,
+                        0,
+                        byte_shift,
+                    );
+                    dyn_vext_128(self, left, right, byte_shift % 16)
+                },
+                {
+                    let [left, right] = crate::support::cross_block_slide_blocks_at(
+                        &a_regs,
+                        &b_regs,
+                        1,
+                        byte_shift,
+                    );
+                    dyn_vext_128(self, left, right, byte_shift % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(slid),
+            simd: self,
+        })
+    }
+    #[inline(always)] // Applies slide_within_blocks_u8x16::<SHIFT> to each pair of halves.
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+        self,
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(
+            self.slide_within_blocks_u8x16::<SHIFT>(al, bl),
+            self.slide_within_blocks_u8x16::<SHIFT>(ah, bh),
+        )
+    }
+    #[inline(always)] // Splits `a` and `b`, applies add_u8x16 to each pair of halves.
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.add_u8x16(al, bl), self.add_u8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies sub_u8x16 to each pair of halves.
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.sub_u8x16(al, bl), self.sub_u8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies mul_u8x16 to each pair of halves.
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.mul_u8x16(al, bl), self.mul_u8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies and_u8x16 to each pair of halves.
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.and_u8x16(al, bl), self.and_u8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies or_u8x16 to each pair of halves.
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.or_u8x16(al, bl), self.or_u8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies xor_u8x16 to each pair of halves.
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.xor_u8x16(al, bl), self.xor_u8x16(ah, bh))
+    }
+    #[inline(always)] // Splits `a`, applies not_u8x16 to each half, recombines.
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let (lo, hi) = self.split_u8x32(a);
+        self.combine_u8x16(self.not_u8x16(lo), self.not_u8x16(hi))
+    }
+    #[inline(always)] // Applies shl_u8x16 with the same `shift` to each half of `a`.
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let (lo, hi) = self.split_u8x32(a);
+        self.combine_u8x16(self.shl_u8x16(lo, shift), self.shl_u8x16(hi, shift))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies shlv_u8x16 to each pair of halves.
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.shlv_u8x16(al, bl), self.shlv_u8x16(ah, bh))
+    }
+    #[inline(always)] // Applies shr_u8x16 with the same `shift` to each half of `a`.
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let (lo, hi) = self.split_u8x32(a);
+        self.combine_u8x16(self.shr_u8x16(lo, shift), self.shr_u8x16(hi, shift))
+    }
+    #[inline(always)] // Splits `a` and `b`, applies shrv_u8x16 to each pair of halves.
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (al, ah) = self.split_u8x32(a);
+        let (bl, bh) = self.split_u8x32(b);
+        self.combine_u8x16(self.shrv_u8x16(al, bl), self.shrv_u8x16(ah, bh))
+    }
+    #[inline(always)]
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, _) = self.split_u8x32(a);
+        let (b0, _) = self.split_u8x32(b);
+        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (_, a1) = self.split_u8x32(a);
+        let (_, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let lo_lo = self.zip_low_u8x16(a0, b0);
+        let lo_hi = self.zip_high_u8x16(a0, b0);
+        let hi_lo = self.zip_low_u8x16(a1, b1);
+        let hi_hi = self.zip_high_u8x16(a1, b1);
+        (
+            self.combine_u8x16(lo_lo, lo_hi),
+            self.combine_u8x16(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let lo_even = self.unzip_low_u8x16(a0, a1);
+        let lo_odd = self.unzip_high_u8x16(a0, a1);
+        let hi_even = self.unzip_low_u8x16(b0, b1);
+        let hi_odd = self.unzip_high_u8x16(b0, b1);
+        (
+            self.combine_u8x16(lo_even, hi_even),
+            self.combine_u8x16(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let (c0, c1) = self.split_u8x32(c);
+        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::support::Aligned512(uint8x16x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        (
+            u8x16 {
+                val: crate::support::Aligned128(a.val.0.0),
+                simd: self,
+            },
+            u8x16 {
+                val: crate::support::Aligned128(a.val.0.1),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u8x16(a0),
+            self.reinterpret_u32_u8x16(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+        let half = self.splat_mask8x16(val);
+        self.combine_mask8x16(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        mask8x32 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<int8x16x2_t, [i8; 32usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
+    }
+    #[inline(always)]
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        let (c0, c1) = self.split_mask8x32(c);
+        self.combine_mask8x16(
+            self.select_mask8x16(a0, b0, c0),
+            self.select_mask8x16(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
+    }
+    #[inline(always)]
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        mask8x64 {
+            val: crate::support::Aligned512(int8x16x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
+        (
+            mask8x16 {
+                val: crate::support::Aligned128(a.val.0.0),
+                simd: self,
+            },
+            mask8x16 {
+                val: crate::support::Aligned128(a.val.0.1),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        let half = self.splat_i16x8(val);
+        self.combine_i16x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<int16x8x2_t, [i16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        crate::transmute::checked_cast_ref::<int16x8x2_t, [i16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        crate::transmute::checked_cast_mut::<int16x8x2_t, [i16; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = {
+            let a_bytes = self.cvt_to_bytes_i16x16(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i16x16(b).val.0;
+            let a_blocks = [a_bytes.0, a_bytes.1];
+            let b_blocks = [b_bytes.0, b_bytes.1];
+            let shift_bytes = SHIFT * 2usize;
+            uint8x16x2_t(
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        0,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        1,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
+        self,
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(
+            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(
-            self.mul_sub_f32x4(a0, b0, c0),
-            self.mul_sub_f32x4(a1, b1, c1),
-        )
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(
-            self.round_ties_even_f32x4(a0),
-            self.round_ties_even_f32x4(a1),
-        )
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
     }
     #[inline(always)]
-    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
-        f32x16 {
-            val: crate::support::Aligned512(float32x4x4_t(
-                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
-            )),
-            simd: self,
-        }
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
     }
     #[inline(always)]
-    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, _) = self.split_i16x16(a);
+        let (b0, _) = self.split_i16x16(b);
+        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (_, a1) = self.split_i16x16(a);
+        let (_, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let lo_lo = self.zip_low_i16x8(a0, b0);
+        let lo_hi = self.zip_high_i16x8(a0, b0);
+        let hi_lo = self.zip_low_i16x8(a1, b1);
+        let hi_hi = self.zip_high_i16x8(a1, b1);
         (
-            f32x4 {
-                val: crate::support::Aligned128(a.val.0.0),
-                simd: self,
-            },
-            f32x4 {
-                val: crate::support::Aligned128(a.val.0.1),
-                simd: self,
-            },
+            self.combine_i16x8(lo_lo, lo_hi),
+            self.combine_i16x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f64x2(
-            self.reinterpret_f64_f32x4(a0),
-            self.reinterpret_f64_f32x4(a1),
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let lo_even = self.unzip_low_i16x8(a0, a1);
+        let lo_odd = self.unzip_high_i16x8(a0, a1);
+        let hi_even = self.unzip_low_i16x8(b0, b1);
+        let hi_odd = self.unzip_high_i16x8(b0, b1);
+        (
+            self.combine_i16x8(lo_even, hi_even),
+            self.combine_i16x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(
-            self.reinterpret_i32_f32x4(a0),
-            self.reinterpret_i32_f32x4(a1),
-        )
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let (c0, c1) = self.split_i16x16(c);
+        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_f32x4(a0),
-            self.reinterpret_u32_f32x4(a1),
-        )
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        i16x32 {
+            val: crate::support::Aligned512(int16x8x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(
-            self.cvt_u32_precise_f32x4(a0),
-            self.cvt_u32_precise_f32x4(a1),
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+        (
+            i16x8 {
+                val: crate::support::Aligned128(a.val.0.0),
+                simd: self,
+            },
+            i16x8 {
+                val: crate::support::Aligned128(a.val.0.1),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(
-            self.cvt_i32_precise_f32x4(a0),
-            self.cvt_i32_precise_f32x4(a1),
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i16x8(a0),
+            self.reinterpret_u32_i16x8(a1),
         )
     }
     #[inline(always)]
-    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
-        let half = self.splat_i8x16(val);
-        self.combine_i8x16(half, half)
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+        let half = self.splat_u16x8(val);
+        self.combine_u16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<int8x16x2_t, [i8; 32usize]>(&a.val.0)
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        crate::transmute::checked_transmute_copy::<uint16x8x2_t, [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
-        crate::transmute::checked_cast_ref::<int8x16x2_t, [i8; 32usize]>(&a.val.0)
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        crate::transmute::checked_cast_ref::<uint16x8x2_t, [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
-        crate::transmute::checked_cast_mut::<int8x16x2_t, [i8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        crate::transmute::checked_cast_mut::<uint16x8x2_t, [u16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
-        i8x32 {
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_i8x32(a).val.0;
-            let b_bytes = self.cvt_to_bytes_i8x32(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u16x16(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u16x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
             let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT;
+            let shift_bytes = SHIFT * 2usize;
             uint8x16x2_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -4356,286 +6134,420 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_i8x32(u8x32 {
+        self.cvt_from_bytes_u16x16(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
         self,
-        a: i8x32<Self>,
-        b: i8x32<Self>,
-    ) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(
-            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(
+            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
+    }
+    #[inline(always)]
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, _) = self.split_u16x16(a);
+        let (b0, _) = self.split_u16x16(b);
+        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (_, a1) = self.split_u16x16(a);
+        let (_, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let lo_lo = self.zip_low_u16x8(a0, b0);
+        let lo_hi = self.zip_high_u16x8(a0, b0);
+        let hi_lo = self.zip_low_u16x8(a1, b1);
+        let hi_hi = self.zip_high_u16x8(a1, b1);
+        (
+            self.combine_u16x8(lo_lo, lo_hi),
+            self.combine_u16x8(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let lo_even = self.unzip_low_u16x8(a0, a1);
+        let lo_odd = self.unzip_high_u16x8(a0, a1);
+        let hi_even = self.unzip_low_u16x8(b0, b1);
+        let hi_odd = self.unzip_high_u16x8(b0, b1);
+        (
+            self.combine_u16x8(lo_even, hi_even),
+            self.combine_u16x8(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let (c0, c1) = self.split_u16x16(c);
+        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        u16x32 {
+            val: crate::support::Aligned512(uint16x8x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+        (
+            u16x8 {
+                val: crate::support::Aligned128(a.val.0.0),
+                simd: self,
+            },
+            u16x8 {
+                val: crate::support::Aligned128(a.val.0.1),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Neon, a: u16x16<Neon>) -> u8x16<Neon> {
+                let converted: uint16x8x2_t = a.into();
+                let low = vmovn_u16(converted.0);
+                let high = vmovn_u16(converted.1);
+                vcombine_u8(low, high).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
     }
     #[inline(always)]
-    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u16x8(a0),
+            self.reinterpret_u32_u16x8(a1),
+        )
     }
     #[inline(always)]
-    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+        let half = self.splat_mask16x8(val);
+        self.combine_mask16x8(half, half)
     }
     #[inline(always)]
-    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        mask16x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<int16x8x2_t, [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
     }
     #[inline(always)]
-    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x16(a);
+        let lo = self.to_bitmask_mask16x8(lo);
+        let hi = self.to_bitmask_mask16x8(hi);
+        lo | (hi << 8usize)
     }
     #[inline(always)]
-    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
     }
     #[inline(always)]
-    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, _) = self.split_i8x32(a);
-        let (b0, _) = self.split_i8x32(b);
-        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (_, a1) = self.split_i8x32(a);
-        let (_, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
     }
     #[inline(always)]
-    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let lo_lo = self.zip_low_i8x16(a0, b0);
-        let lo_hi = self.zip_high_i8x16(a0, b0);
-        let hi_lo = self.zip_low_i8x16(a1, b1);
-        let hi_hi = self.zip_high_i8x16(a1, b1);
-        (
-            self.combine_i8x16(lo_lo, lo_hi),
-            self.combine_i8x16(hi_lo, hi_hi),
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        let (c0, c1) = self.split_mask16x16(c);
+        self.combine_mask16x8(
+            self.select_mask16x8(a0, b0, c0),
+            self.select_mask16x8(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let lo_even = self.unzip_low_i8x16(a0, a1);
-        let lo_odd = self.unzip_high_i8x16(a0, a1);
-        let hi_even = self.unzip_low_i8x16(b0, b1);
-        let hi_odd = self.unzip_high_i8x16(b0, b1);
-        (
-            self.combine_i8x16(lo_even, hi_even),
-            self.combine_i8x16(lo_odd, hi_odd),
-        )
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
     }
     #[inline(always)]
-    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let (c0, c1) = self.split_i8x32(c);
-        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
     }
     #[inline(always)]
-    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
     }
     #[inline(always)]
-    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
     }
     #[inline(always)]
-    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
-        i8x64 {
-            val: crate::support::Aligned512(int8x16x4_t(
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: crate::support::Aligned512(int16x8x4_t(
                 a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
             )),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
         (
-            i8x16 {
+            mask16x8 {
                 val: crate::support::Aligned128(a.val.0.0),
                 simd: self,
             },
-            i8x16 {
+            mask16x8 {
                 val: crate::support::Aligned128(a.val.0.1),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_i8x16(a0),
-            self.reinterpret_u32_i8x16(a1),
-        )
-    }
-    #[inline(always)]
-    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
-        let half = self.splat_u8x16(val);
-        self.combine_u8x16(half, half)
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+        let half = self.splat_i32x4(val);
+        self.combine_i32x4(half, half)
     }
     #[inline(always)]
-    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
-        crate::transmute::checked_transmute_copy::<uint8x16x2_t, [u8; 32usize]>(&a.val.0)
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<int32x4x2_t, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
-        crate::transmute::checked_cast_ref::<uint8x16x2_t, [u8; 32usize]>(&a.val.0)
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        crate::transmute::checked_cast_ref::<int32x4x2_t, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
-        crate::transmute::checked_cast_mut::<uint8x16x2_t, [u8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        crate::transmute::checked_cast_mut::<int32x4x2_t, [i32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_u8x32(a).val.0;
-            let b_bytes = self.cvt_to_bytes_u8x32(b).val.0;
+            let a_bytes = self.cvt_to_bytes_i32x8(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i32x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
             let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT;
+            let shift_bytes = SHIFT * 4usize;
             uint8x16x2_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -4657,708 +6569,710 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_u8x32(u8x32 {
+        self.cvt_from_bytes_i32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
         self,
-        a: u8x32<Self>,
-        b: u8x32<Self>,
-    ) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(
-            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(
+            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
     }
     #[inline(always)]
-    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, _) = self.split_u8x32(a);
-        let (b0, _) = self.split_u8x32(b);
-        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, _) = self.split_i32x8(a);
+        let (b0, _) = self.split_i32x8(b);
+        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (_, a1) = self.split_u8x32(a);
-        let (_, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (_, a1) = self.split_i32x8(a);
+        let (_, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let lo_lo = self.zip_low_u8x16(a0, b0);
-        let lo_hi = self.zip_high_u8x16(a0, b0);
-        let hi_lo = self.zip_low_u8x16(a1, b1);
-        let hi_hi = self.zip_high_u8x16(a1, b1);
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let lo_lo = self.zip_low_i32x4(a0, b0);
+        let lo_hi = self.zip_high_i32x4(a0, b0);
+        let hi_lo = self.zip_low_i32x4(a1, b1);
+        let hi_hi = self.zip_high_i32x4(a1, b1);
         (
-            self.combine_u8x16(lo_lo, lo_hi),
-            self.combine_u8x16(hi_lo, hi_hi),
+            self.combine_i32x4(lo_lo, lo_hi),
+            self.combine_i32x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let lo_even = self.unzip_low_u8x16(a0, a1);
-        let lo_odd = self.unzip_high_u8x16(a0, a1);
-        let hi_even = self.unzip_low_u8x16(b0, b1);
-        let hi_odd = self.unzip_high_u8x16(b0, b1);
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let lo_even = self.unzip_low_i32x4(a0, a1);
+        let lo_odd = self.unzip_high_i32x4(a0, a1);
+        let hi_even = self.unzip_low_i32x4(b0, b1);
+        let hi_odd = self.unzip_high_i32x4(b0, b1);
         (
-            self.combine_u8x16(lo_even, hi_even),
-            self.combine_u8x16(lo_odd, hi_odd),
+            self.combine_i32x4(lo_even, hi_even),
+            self.combine_i32x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let (c0, c1) = self.split_u8x32(c);
-        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let (c0, c1) = self.split_i32x8(c);
+        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
-        u8x64 {
-            val: crate::support::Aligned512(uint8x16x4_t(
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
+        i32x16 {
+            val: crate::support::Aligned512(int32x4x4_t(
                 a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
             )),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
         (
-            u8x16 {
+            i32x4 {
                 val: crate::support::Aligned128(a.val.0.0),
                 simd: self,
             },
-            u8x16 {
+            i32x4 {
                 val: crate::support::Aligned128(a.val.0.1),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u8x32(a);
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
         self.combine_u32x4(
-            self.reinterpret_u32_u8x16(a0),
-            self.reinterpret_u32_u8x16(a1),
+            self.reinterpret_u32_i32x4(a0),
+            self.reinterpret_u32_i32x4(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
-        let half = self.splat_mask8x16(val);
-        self.combine_mask8x16(half, half)
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
     }
     #[inline(always)]
-    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
-        mask8x32 {
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        let half = self.splat_u32x4(val);
+        self.combine_u32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<int8x16x2_t, [i8; 32usize]>(&a.val.0)
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
-        let lo = self.from_bitmask_mask8x16(bits);
-        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
-        self.combine_mask8x16(lo, hi)
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        crate::transmute::checked_transmute_copy::<uint32x4x2_t, [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x32(a);
-        let lo = self.to_bitmask_mask8x16(lo);
-        let hi = self.to_bitmask_mask8x16(hi);
-        lo | (hi << 16usize)
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        crate::transmute::checked_cast_ref::<uint32x4x2_t, [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 32usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32usize
-        );
-        let mut lanes = self.as_array_mask8x32(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x32(lanes);
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        crate::transmute::checked_cast_mut::<uint32x4x2_t, [u32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = {
+            let a_bytes = self.cvt_to_bytes_u32x8(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u32x8(b).val.0;
+            let a_blocks = [a_bytes.0, a_bytes.1];
+            let b_blocks = [b_bytes.0, b_bytes.1];
+            let shift_bytes = SHIFT * 4usize;
+            uint8x16x2_t(
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        0,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        1,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask8x32(
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
         self,
-        a: mask8x32<Self>,
-        b: mask8x32<Self>,
-        c: mask8x32<Self>,
-    ) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        let (c0, c1) = self.split_mask8x32(c);
-        self.combine_mask8x16(
-            self.select_mask8x16(a0, b0, c0),
-            self.select_mask8x16(a1, b1, c1),
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(
+            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: crate::support::Aligned512(int8x16x4_t(
-                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
-            )),
-            simd: self,
-        }
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
-        (
-            mask8x16 {
-                val: crate::support::Aligned128(a.val.0.0),
-                simd: self,
-            },
-            mask8x16 {
-                val: crate::support::Aligned128(a.val.0.1),
-                simd: self,
-            },
-        )
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
     }
     #[inline(always)]
-    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
-        let half = self.splat_i16x8(val);
-        self.combine_i16x8(half, half)
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
     }
     #[inline(always)]
-    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<int16x8x2_t, [i16; 16usize]>(&a.val.0)
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
-        crate::transmute::checked_cast_ref::<int16x8x2_t, [i16; 16usize]>(&a.val.0)
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
-        crate::transmute::checked_cast_mut::<int16x8x2_t, [i16; 16usize]>(&mut a.val.0)
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, _) = self.split_u32x8(a);
+        let (b0, _) = self.split_u32x8(b);
+        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (_, a1) = self.split_u32x8(a);
+        let (_, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = {
-            let a_bytes = self.cvt_to_bytes_i16x16(a).val.0;
-            let b_bytes = self.cvt_to_bytes_i16x16(b).val.0;
-            let a_blocks = [a_bytes.0, a_bytes.1];
-            let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT * 2usize;
-            uint8x16x2_t(
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        0,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        1,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-            )
-        };
-        self.cvt_from_bytes_i16x16(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x16<const SHIFT: usize>(
-        self,
-        a: i16x16<Self>,
-        b: i16x16<Self>,
-    ) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(
-            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
-        )
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
     }
     #[inline(always)]
-    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let lo_lo = self.zip_low_u32x4(a0, b0);
+        let lo_hi = self.zip_high_u32x4(a0, b0);
+        let hi_lo = self.zip_low_u32x4(a1, b1);
+        let hi_hi = self.zip_high_u32x4(a1, b1);
+        (
+            self.combine_u32x4(lo_lo, lo_hi),
+            self.combine_u32x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let lo_even = self.unzip_low_u32x4(a0, a1);
+        let lo_odd = self.unzip_high_u32x4(a0, a1);
+        let hi_even = self.unzip_low_u32x4(b0, b1);
+        let hi_odd = self.unzip_high_u32x4(b0, b1);
+        (
+            self.combine_u32x4(lo_even, hi_even),
+            self.combine_u32x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let (c0, c1) = self.split_u32x8(c);
+        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+        u32x16 {
+            val: crate::support::Aligned512(uint32x4x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        (
+            u32x4 {
+                val: crate::support::Aligned128(a.val.0.0),
+                simd: self,
+            },
+            u32x4 {
+                val: crate::support::Aligned128(a.val.0.1),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
     }
     #[inline(always)]
-    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
     }
     #[inline(always)]
-    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+        let half = self.splat_mask32x4(val);
+        self.combine_mask32x4(half, half)
     }
     #[inline(always)]
-    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+        mask32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<int32x4x2_t, [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
     }
     #[inline(always)]
-    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
     }
     #[inline(always)]
-    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
     }
     #[inline(always)]
-    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, _) = self.split_i16x16(a);
-        let (b0, _) = self.split_i16x16(b);
-        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (_, a1) = self.split_i16x16(a);
-        let (_, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
     }
     #[inline(always)]
-    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
+    fn select_mask32x8(
+        self,
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        let (c0, c1) = self.split_mask32x8(c);
+        self.combine_mask32x4(
+            self.select_mask32x4(a0, b0, c0),
+            self.select_mask32x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let lo_lo = self.zip_low_i16x8(a0, b0);
-        let lo_hi = self.zip_high_i16x8(a0, b0);
-        let hi_lo = self.zip_low_i16x8(a1, b1);
-        let hi_hi = self.zip_high_i16x8(a1, b1);
-        (
-            self.combine_i16x8(lo_lo, lo_hi),
-            self.combine_i16x8(hi_lo, hi_hi),
-        )
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let lo_even = self.unzip_low_i16x8(a0, a1);
-        let lo_odd = self.unzip_high_i16x8(a0, a1);
-        let hi_even = self.unzip_low_i16x8(b0, b1);
-        let hi_odd = self.unzip_high_i16x8(b0, b1);
-        (
-            self.combine_i16x8(lo_even, hi_even),
-            self.combine_i16x8(lo_odd, hi_odd),
-        )
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
     }
     #[inline(always)]
-    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let (c0, c1) = self.split_i16x16(c);
-        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
     }
     #[inline(always)]
-    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
     }
     #[inline(always)]
-    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
     }
     #[inline(always)]
-    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
-        i16x32 {
-            val: crate::support::Aligned512(int16x8x4_t(
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        mask32x16 {
+            val: crate::support::Aligned512(int32x4x4_t(
                 a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
             )),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
         (
-            i16x8 {
+            mask32x4 {
                 val: crate::support::Aligned128(a.val.0.0),
                 simd: self,
             },
-            i16x8 {
+            mask32x4 {
                 val: crate::support::Aligned128(a.val.0.1),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_i16x8(a0),
-            self.reinterpret_u32_i16x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
-        let half = self.splat_u16x8(val);
-        self.combine_u16x8(half, half)
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+        let half = self.splat_f64x2(val);
+        self.combine_f64x2(half, half)
     }
     #[inline(always)]
-    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
-        crate::transmute::checked_transmute_copy::<uint16x8x2_t, [u16; 16usize]>(&a.val.0)
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        crate::transmute::checked_transmute_copy::<float64x2x2_t, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
-        crate::transmute::checked_cast_ref::<uint16x8x2_t, [u16; 16usize]>(&a.val.0)
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        crate::transmute::checked_cast_ref::<float64x2x2_t, [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
-        crate::transmute::checked_cast_mut::<uint16x8x2_t, [u16; 16usize]>(&mut a.val.0)
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        crate::transmute::checked_cast_mut::<float64x2x2_t, [f64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
-        u16x16 {
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        if SHIFT >= 4usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_u16x16(a).val.0;
-            let b_bytes = self.cvt_to_bytes_u16x16(b).val.0;
+            let a_bytes = self.cvt_to_bytes_f64x4(a).val.0;
+            let b_bytes = self.cvt_to_bytes_f64x4(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
             let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT * 2usize;
+            let shift_bytes = SHIFT * 8usize;
             uint8x16x2_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -5374,426 +7288,338 @@ impl Simd for Neon {
                         &a_blocks,
                         &b_blocks,
                         1,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-            )
-        };
-        self.cvt_from_bytes_u16x16(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_u16x16<const SHIFT: usize>(
-        self,
-        a: u16x16<Self>,
-        b: u16x16<Self>,
-    ) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(
-            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
-    }
-    #[inline(always)]
-    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
+        self,
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
     }
     #[inline(always)]
-    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, _) = self.split_u16x16(a);
-        let (b0, _) = self.split_u16x16(b);
-        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
     }
     #[inline(always)]
-    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (_, a1) = self.split_u16x16(a);
-        let (_, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(
+            self.approximate_recip_f64x2(a0),
+            self.approximate_recip_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let lo_lo = self.zip_low_u16x8(a0, b0);
-        let lo_hi = self.zip_high_u16x8(a0, b0);
-        let hi_lo = self.zip_low_u16x8(a1, b1);
-        let hi_hi = self.zip_high_u16x8(a1, b1);
-        (
-            self.combine_u16x8(lo_lo, lo_hi),
-            self.combine_u16x8(hi_lo, hi_hi),
-        )
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let lo_even = self.unzip_low_u16x8(a0, a1);
-        let lo_odd = self.unzip_high_u16x8(a0, a1);
-        let hi_even = self.unzip_low_u16x8(b0, b1);
-        let hi_odd = self.unzip_high_u16x8(b0, b1);
-        (
-            self.combine_u16x8(lo_even, hi_even),
-            self.combine_u16x8(lo_odd, hi_odd),
-        )
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let (c0, c1) = self.split_u16x16(c);
-        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
-        u16x32 {
-            val: crate::support::Aligned512(uint16x8x4_t(
-                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
-            )),
-            simd: self,
-        }
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
-        (
-            u16x8 {
-                val: crate::support::Aligned128(a.val.0.0),
-                simd: self,
-            },
-            u16x8 {
-                val: crate::support::Aligned128(a.val.0.1),
-                simd: self,
-            },
-        )
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Neon, a: u16x16<Neon>) -> u8x16<Neon> {
-                let converted: uint16x8x2_t = a.into();
-                let low = vmovn_u16(converted.0);
-                let high = vmovn_u16(converted.1);
-                vcombine_u8(low, high).simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, _) = self.split_f64x4(a);
+        let (b0, _) = self.split_f64x4(b);
+        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_u16x8(a0),
-            self.reinterpret_u32_u16x8(a1),
-        )
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (_, a1) = self.split_f64x4(a);
+        let (_, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
-        let half = self.splat_mask16x8(val);
-        self.combine_mask16x8(half, half)
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
     }
     #[inline(always)]
-    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        mask16x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
     }
     #[inline(always)]
-    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<int16x8x2_t, [i16; 16usize]>(&a.val.0)
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let lo_lo = self.zip_low_f64x2(a0, b0);
+        let lo_hi = self.zip_high_f64x2(a0, b0);
+        let hi_lo = self.zip_low_f64x2(a1, b1);
+        let hi_hi = self.zip_high_f64x2(a1, b1);
+        (
+            self.combine_f64x2(lo_lo, lo_hi),
+            self.combine_f64x2(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
-        let lo = self.from_bitmask_mask16x8(bits);
-        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
-        self.combine_mask16x8(lo, hi)
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let lo_even = self.unzip_low_f64x2(a0, a1);
+        let lo_odd = self.unzip_high_f64x2(a0, a1);
+        let hi_even = self.unzip_low_f64x2(b0, b1);
+        let hi_odd = self.unzip_high_f64x2(b0, b1);
+        (
+            self.combine_f64x2(lo_even, hi_even),
+            self.combine_f64x2(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x16(a);
-        let lo = self.to_bitmask_mask16x8(lo);
-        let hi = self.to_bitmask_mask16x8(hi);
-        lo | (hi << 8usize)
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask16x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x16(lanes);
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.max_precise_f64x2(a0, b0),
+            self.max_precise_f64x2(a1, b1),
+        )
     }
     #[inline(always)]
-    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.min_precise_f64x2(a0, b0),
+            self.min_precise_f64x2(a1, b1),
+        )
     }
     #[inline(always)]
-    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(
+            self.mul_add_f64x2(a0, b0, c0),
+            self.mul_add_f64x2(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(
+            self.mul_sub_f64x2(a0, b0, c0),
+            self.mul_sub_f64x2(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn select_mask16x16(
-        self,
-        a: mask16x16<Self>,
-        b: mask16x16<Self>,
-        c: mask16x16<Self>,
-    ) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        let (c0, c1) = self.split_mask16x16(c);
-        self.combine_mask16x8(
-            self.select_mask16x8(a0, b0, c0),
-            self.select_mask16x8(a1, b1, c1),
-        )
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
     }
     #[inline(always)]
-    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(
+            self.round_ties_even_f64x2(a0),
+            self.round_ties_even_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
     }
     #[inline(always)]
-    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
     }
     #[inline(always)]
-    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
     }
     #[inline(always)]
-    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: crate::support::Aligned512(int16x8x4_t(
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
+        f64x8 {
+            val: crate::support::Aligned512(float64x2x4_t(
                 a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
             )),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
         (
-            mask16x8 {
+            f64x2 {
                 val: crate::support::Aligned128(a.val.0.0),
                 simd: self,
             },
-            mask16x8 {
+            f64x2 {
                 val: crate::support::Aligned128(a.val.0.1),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
-        let half = self.splat_i32x4(val);
-        self.combine_i32x4(half, half)
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f32x4(
+            self.reinterpret_f32_f64x2(a0),
+            self.reinterpret_f32_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
+    fn splat_i64x4(self, val: i64) -> i64x4<Self> {
+        let half = self.splat_i64x2(val);
+        self.combine_i64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<int32x4x2_t, [i32; 8usize]>(&a.val.0)
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<int64x2x2_t, [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
-        crate::transmute::checked_cast_ref::<int32x4x2_t, [i32; 8usize]>(&a.val.0)
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize] {
+        crate::transmute::checked_cast_ref::<int64x2x2_t, [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
-        crate::transmute::checked_cast_mut::<int32x4x2_t, [i32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize] {
+        crate::transmute::checked_cast_mut::<int64x2x2_t, [i64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
-        i32x8 {
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self> {
+        i64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        if SHIFT >= 8usize {
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        if SHIFT >= 4usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_i32x8(a).val.0;
-            let b_bytes = self.cvt_to_bytes_i32x8(b).val.0;
+            let a_bytes = self.cvt_to_bytes_i64x4(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i64x4(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
             let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT * 4usize;
+            let shift_bytes = SHIFT * 8usize;
             uint8x16x2_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -5815,291 +7641,286 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_i32x8(u8x32 {
+        self.cvt_from_bytes_i64x4(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x8<const SHIFT: usize>(
+    fn slide_within_blocks_i64x4<const SHIFT: usize>(
         self,
-        a: i32x8<Self>,
-        b: i32x8<Self>,
-    ) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(
-            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(
+            self.slide_within_blocks_i64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1))
     }
     #[inline(always)]
-    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, _) = self.split_i32x8(a);
-        let (b0, _) = self.split_i32x8(b);
-        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, _) = self.split_i64x4(a);
+        let (b0, _) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (_, a1) = self.split_i32x8(a);
-        let (_, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (_, a1) = self.split_i64x4(a);
+        let (_, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let lo_lo = self.zip_low_i32x4(a0, b0);
-        let lo_hi = self.zip_high_i32x4(a0, b0);
-        let hi_lo = self.zip_low_i32x4(a1, b1);
-        let hi_hi = self.zip_high_i32x4(a1, b1);
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_lo = self.zip_low_i64x2(a0, b0);
+        let lo_hi = self.zip_high_i64x2(a0, b0);
+        let hi_lo = self.zip_low_i64x2(a1, b1);
+        let hi_hi = self.zip_high_i64x2(a1, b1);
         (
-            self.combine_i32x4(lo_lo, lo_hi),
-            self.combine_i32x4(hi_lo, hi_hi),
+            self.combine_i64x2(lo_lo, lo_hi),
+            self.combine_i64x2(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let lo_even = self.unzip_low_i32x4(a0, a1);
-        let lo_odd = self.unzip_high_i32x4(a0, a1);
-        let hi_even = self.unzip_low_i32x4(b0, b1);
-        let hi_odd = self.unzip_high_i32x4(b0, b1);
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_even = self.unzip_low_i64x2(a0, a1);
+        let lo_odd = self.unzip_high_i64x2(a0, a1);
+        let hi_even = self.unzip_low_i64x2(b0, b1);
+        let hi_odd = self.unzip_high_i64x2(b0, b1);
         (
-            self.combine_i32x4(lo_even, hi_even),
-            self.combine_i32x4(lo_odd, hi_odd),
+            self.combine_i64x2(lo_even, hi_even),
+            self.combine_i64x2(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let (c0, c1) = self.split_i32x8(c);
-        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let (c0, c1) = self.split_i64x4(c);
+        self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
-        i32x16 {
-            val: crate::support::Aligned512(int32x4x4_t(
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self> {
+        i64x8 {
+            val: crate::support::Aligned512(int64x2x4_t(
                 a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
             )),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>) {
         (
-            i32x4 {
+            i64x2 {
                 val: crate::support::Aligned128(a.val.0.0),
                 simd: self,
             },
-            i32x4 {
+            i64x2 {
                 val: crate::support::Aligned128(a.val.0.1),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i64x4(a);
         self.combine_u32x4(
-            self.reinterpret_u32_i32x4(a0),
-            self.reinterpret_u32_i32x4(a1),
+            self.reinterpret_u32_i64x2(a0),
+            self.reinterpret_u32_i64x2(a1),
         )
     }
     #[inline(always)]
-    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
-    }
-    #[inline(always)]
-    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
-        let half = self.splat_u32x4(val);
-        self.combine_u32x4(half, half)
+    fn splat_u64x4(self, val: u64) -> u64x4<Self> {
+        let half = self.splat_u64x2(val);
+        self.combine_u64x2(half, half)
     }
     #[inline(always)]
-    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
-        crate::transmute::checked_transmute_copy::<uint32x4x2_t, [u32; 8usize]>(&a.val.0)
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize] {
+        crate::transmute::checked_transmute_copy::<uint64x2x2_t, [u64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
-        crate::transmute::checked_cast_ref::<uint32x4x2_t, [u32; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize] {
+        crate::transmute::checked_cast_ref::<uint64x2x2_t, [u64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
-        crate::transmute::checked_cast_mut::<uint32x4x2_t, [u32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize] {
+        crate::transmute::checked_cast_mut::<uint64x2x2_t, [u64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
-        u32x8 {
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        if SHIFT >= 8usize {
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        if SHIFT >= 4usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_u32x8(a).val.0;
-            let b_bytes = self.cvt_to_bytes_u32x8(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u64x4(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u64x4(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
             let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT * 4usize;
+            let shift_bytes = SHIFT * 8usize;
             uint8x16x2_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -6121,877 +7942,1128 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_u32x8(u8x32 {
+        self.cvt_from_bytes_u64x4(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x4<const SHIFT: usize>(
         self,
-        a: u32x8<Self>,
-        b: u32x8<Self>,
-    ) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(
-            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(
+            self.slide_within_blocks_u64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1))
     }
     #[inline(always)]
-    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, _) = self.split_u32x8(a);
-        let (b0, _) = self.split_u32x8(b);
-        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, _) = self.split_u64x4(a);
+        let (b0, _) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (_, a1) = self.split_u32x8(a);
-        let (_, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (_, a1) = self.split_u64x4(a);
+        let (_, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let lo_lo = self.zip_low_u32x4(a0, b0);
-        let lo_hi = self.zip_high_u32x4(a0, b0);
-        let hi_lo = self.zip_low_u32x4(a1, b1);
-        let hi_hi = self.zip_high_u32x4(a1, b1);
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_lo = self.zip_low_u64x2(a0, b0);
+        let lo_hi = self.zip_high_u64x2(a0, b0);
+        let hi_lo = self.zip_low_u64x2(a1, b1);
+        let hi_hi = self.zip_high_u64x2(a1, b1);
         (
-            self.combine_u32x4(lo_lo, lo_hi),
-            self.combine_u32x4(hi_lo, hi_hi),
+            self.combine_u64x2(lo_lo, lo_hi),
+            self.combine_u64x2(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let lo_even = self.unzip_low_u32x4(a0, a1);
-        let lo_odd = self.unzip_high_u32x4(a0, a1);
-        let hi_even = self.unzip_low_u32x4(b0, b1);
-        let hi_odd = self.unzip_high_u32x4(b0, b1);
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_even = self.unzip_low_u64x2(a0, a1);
+        let lo_odd = self.unzip_high_u64x2(a0, a1);
+        let hi_even = self.unzip_low_u64x2(b0, b1);
+        let hi_odd = self.unzip_high_u64x2(b0, b1);
         (
-            self.combine_u32x4(lo_even, hi_even),
-            self.combine_u32x4(lo_odd, hi_odd),
+            self.combine_u64x2(lo_even, hi_even),
+            self.combine_u64x2(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let (c0, c1) = self.split_u64x4(c);
+        self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self> {
+        u64x8 {
+            val: crate::support::Aligned512(uint64x2x4_t(
+                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
+            )),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (
+            u64x2 {
+                val: crate::support::Aligned128(a.val.0.0),
+                simd: self,
+            },
+            u64x2 {
+                val: crate::support::Aligned128(a.val.0.1),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u64x2(a0),
+            self.reinterpret_u32_u64x2(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        let half = self.splat_mask64x2(val);
+        self.combine_mask64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<int64x2x2_t, [i64; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
+    }
+    #[inline(always)]
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        let (c0, c1) = self.split_mask64x4(c);
+        self.combine_mask64x2(
+            self.select_mask64x2(a0, b0, c0),
+            self.select_mask64x2(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let (c0, c1) = self.split_u32x8(c);
-        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
     }
     #[inline(always)]
-    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
     }
     #[inline(always)]
-    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
     }
     #[inline(always)]
-    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
-        u32x16 {
-            val: crate::support::Aligned512(uint32x4x4_t(
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: crate::support::Aligned512(int64x2x4_t(
                 a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
             )),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
         (
-            u32x4 {
+            mask64x2 {
                 val: crate::support::Aligned128(a.val.0.0),
                 simd: self,
             },
-            u32x4 {
+            mask64x2 {
                 val: crate::support::Aligned128(a.val.0.1),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+        let half = self.splat_f32x8(val);
+        self.combine_f32x8(half, half)
     }
     #[inline(always)]
-    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
-        let half = self.splat_mask32x4(val);
-        self.combine_mask32x4(half, half)
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
-        mask32x8 {
-            val: crate::transmute::checked_transmute_copy(&val),
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        crate::transmute::checked_transmute_copy::<float32x4x4_t, [f32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        crate::transmute::checked_cast_ref::<float32x4x4_t, [f32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        crate::transmute::checked_cast_mut::<float32x4x4_t, [f32; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<int32x4x2_t, [i32; 8usize]>(&a.val.0)
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
-        let lo = self.from_bitmask_mask32x4(bits);
-        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
-        self.combine_mask32x4(lo, hi)
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = {
+            let a_bytes = self.cvt_to_bytes_f32x16(a).val.0;
+            let b_bytes = self.cvt_to_bytes_f32x16(b).val.0;
+            let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
+            let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
+            let shift_bytes = SHIFT * 4usize;
+            uint8x16x4_t(
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        0,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        1,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        2,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        3,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x8(a);
-        let lo = self.to_bitmask_mask32x4(lo);
-        let hi = self.to_bitmask_mask32x4(hi);
-        lo | (hi << 4usize)
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+        self,
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 8usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8usize
-        );
-        let mut lanes = self.as_array_mask32x8(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x8(lanes);
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
     }
     #[inline(always)]
-    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
     }
     #[inline(always)]
-    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
     }
     #[inline(always)]
-    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.approximate_recip_f32x8(a0),
+            self.approximate_recip_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn select_mask32x8(
-        self,
-        a: mask32x8<Self>,
-        b: mask32x8<Self>,
-        c: mask32x8<Self>,
-    ) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        let (c0, c1) = self.split_mask32x8(c);
-        self.combine_mask32x4(
-            self.select_mask32x4(a0, b0, c0),
-            self.select_mask32x4(a1, b1, c1),
-        )
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::support::Aligned512(int32x4x4_t(
-                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
-            )),
-            simd: self,
-        }
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
-        (
-            mask32x4 {
-                val: crate::support::Aligned128(a.val.0.0),
-                simd: self,
-            },
-            mask32x4 {
-                val: crate::support::Aligned128(a.val.0.1),
-                simd: self,
-            },
-        )
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
-        let half = self.splat_f64x2(val);
-        self.combine_f64x2(half, half)
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, _) = self.split_f32x16(a);
+        let (b0, _) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
     }
     #[inline(always)]
-    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (_, a1) = self.split_f32x16(a);
+        let (_, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
     }
     #[inline(always)]
-    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
-        crate::transmute::checked_transmute_copy::<float64x2x2_t, [f64; 4usize]>(&a.val.0)
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
     }
     #[inline(always)]
-    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
-        crate::transmute::checked_cast_ref::<float64x2x2_t, [f64; 4usize]>(&a.val.0)
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_lo = self.zip_low_f32x8(a0, b0);
+        let lo_hi = self.zip_high_f32x8(a0, b0);
+        let hi_lo = self.zip_low_f32x8(a1, b1);
+        let hi_hi = self.zip_high_f32x8(a1, b1);
+        (
+            self.combine_f32x8(lo_lo, lo_hi),
+            self.combine_f32x8(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
-        crate::transmute::checked_cast_mut::<float64x2x2_t, [f64; 4usize]>(&mut a.val.0)
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_even = self.unzip_low_f32x8(a0, a1);
+        let lo_odd = self.unzip_high_f32x8(a0, a1);
+        let hi_even = self.unzip_low_f32x8(b0, b1);
+        let hi_odd = self.unzip_high_f32x8(b0, b1);
+        (
+            self.combine_f32x8(lo_even, hi_even),
+            self.combine_f32x8(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.max_precise_f32x8(a0, b0),
+            self.max_precise_f32x8(a1, b1),
+        )
     }
     #[inline(always)]
-    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let result = {
-            let a_bytes = self.cvt_to_bytes_f64x4(a).val.0;
-            let b_bytes = self.cvt_to_bytes_f64x4(b).val.0;
-            let a_blocks = [a_bytes.0, a_bytes.1];
-            let b_blocks = [b_bytes.0, b_bytes.1];
-            let shift_bytes = SHIFT * 8usize;
-            uint8x16x2_t(
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        0,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        1,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-            )
-        };
-        self.cvt_from_bytes_f64x4(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.min_precise_f32x8(a0, b0),
+            self.min_precise_f32x8(a1, b1),
+        )
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x4<const SHIFT: usize>(
-        self,
-        a: f64x4<Self>,
-        b: f64x4<Self>,
-    ) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_add_f32x8(a0, b0, c0),
+            self.mul_add_f32x8(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_sub_f32x8(a0, b0, c0),
+            self.mul_sub_f32x8(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
     }
     #[inline(always)]
-    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
     }
     #[inline(always)]
-    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(
-            self.approximate_recip_f64x2(a0),
-            self.approximate_recip_f64x2(a1),
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.round_ties_even_f32x8(a0),
+            self.round_ties_even_f32x8(a1),
         )
     }
     #[inline(always)]
-    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
     }
     #[inline(always)]
-    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
     }
     #[inline(always)]
-    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
+    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        (
+            f32x8 {
+                val: crate::support::Aligned256(float32x4x2_t(a.val.0.0, a.val.0.1)),
+                simd: self,
+            },
+            f32x8 {
+                val: crate::support::Aligned256(float32x4x2_t(a.val.0.2, a.val.0.3)),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f64x4(
+            self.reinterpret_f64_f32x8(a0),
+            self.reinterpret_f64_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.reinterpret_i32_f32x8(a0),
+            self.reinterpret_i32_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        unsafe { vld4q_f32(src.as_ptr()).simd_into(self) }
     }
     #[inline(always)]
-    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) }
     }
     #[inline(always)]
-    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
     }
     #[inline(always)]
-    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_f32x8(a0),
+            self.reinterpret_u32_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, _) = self.split_f64x4(a);
-        let (b0, _) = self.split_f64x4(b);
-        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
     }
     #[inline(always)]
-    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (_, a1) = self.split_f64x4(a);
-        let (_, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(
+            self.cvt_u32_precise_f32x8(a0),
+            self.cvt_u32_precise_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
     }
     #[inline(always)]
-    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.cvt_i32_precise_f32x8(a0),
+            self.cvt_i32_precise_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let lo_lo = self.zip_low_f64x2(a0, b0);
-        let lo_hi = self.zip_high_f64x2(a0, b0);
-        let hi_lo = self.zip_low_f64x2(a1, b1);
-        let hi_hi = self.zip_high_f64x2(a1, b1);
-        (
-            self.combine_f64x2(lo_lo, lo_hi),
-            self.combine_f64x2(hi_lo, hi_hi),
-        )
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+        let half = self.splat_i8x32(val);
+        self.combine_i8x32(half, half)
     }
     #[inline(always)]
-    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let lo_even = self.unzip_low_f64x2(a0, a1);
-        let lo_odd = self.unzip_high_f64x2(a0, a1);
-        let hi_even = self.unzip_low_f64x2(b0, b1);
-        let hi_odd = self.unzip_high_f64x2(b0, b1);
-        (
-            self.combine_f64x2(lo_even, hi_even),
-            self.combine_f64x2(lo_odd, hi_odd),
-        )
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<int8x16x4_t, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.max_precise_f64x2(a0, b0),
-            self.max_precise_f64x2(a1, b1),
-        )
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
+        crate::transmute::checked_cast_ref::<int8x16x4_t, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.min_precise_f64x2(a0, b0),
-            self.min_precise_f64x2(a1, b1),
-        )
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
+        crate::transmute::checked_cast_mut::<int8x16x4_t, [i8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(
-            self.mul_add_f64x2(a0, b0, c0),
-            self.mul_add_f64x2(a1, b1, c1),
-        )
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(
-            self.mul_sub_f64x2(a0, b0, c0),
-            self.mul_sub_f64x2(a1, b1, c1),
-        )
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let result = {
+            let a_bytes = self.cvt_to_bytes_i8x64(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i8x64(b).val.0;
+            let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
+            let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
+            let shift_bytes = SHIFT;
+            uint8x16x4_t(
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        0,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        1,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        2,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        3,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(
-            self.round_ties_even_f64x2(a0),
-            self.round_ties_even_f64x2(a1),
+    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+        self,
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(
+            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
-        f64x8 {
-            val: crate::support::Aligned512(float64x2x4_t(
-                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
-            )),
-            simd: self,
-        }
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
-        (
-            f64x2 {
-                val: crate::support::Aligned128(a.val.0.0),
-                simd: self,
-            },
-            f64x2 {
-                val: crate::support::Aligned128(a.val.0.1),
-                simd: self,
-            },
-        )
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f32x4(
-            self.reinterpret_f32_f64x2(a0),
-            self.reinterpret_f32_f64x2(a1),
-        )
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
     }
     #[inline(always)]
-    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
-        let half = self.splat_mask64x2(val);
-        self.combine_mask64x2(half, half)
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
-        mask64x4 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
     }
     #[inline(always)]
-    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
-        crate::transmute::checked_transmute_copy::<int64x2x2_t, [i64; 4usize]>(&a.val.0)
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
-        let lo = self.from_bitmask_mask64x2(bits);
-        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
-        self.combine_mask64x2(lo, hi)
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
-        let (lo, hi) = self.split_mask64x4(a);
-        let lo = self.to_bitmask_mask64x2(lo);
-        let hi = self.to_bitmask_mask64x2(hi);
-        lo | (hi << 2usize)
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 4usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4usize
-        );
-        let mut lanes = self.as_array_mask64x4(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x4(lanes);
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, _) = self.split_i8x64(a);
+        let (b0, _) = self.split_i8x64(b);
+        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
     }
     #[inline(always)]
-    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (_, a1) = self.split_i8x64(a);
+        let (_, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn select_mask64x4(
-        self,
-        a: mask64x4<Self>,
-        b: mask64x4<Self>,
-        c: mask64x4<Self>,
-    ) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        let (c0, c1) = self.split_mask64x4(c);
-        self.combine_mask64x2(
-            self.select_mask64x2(a0, b0, c0),
-            self.select_mask64x2(a1, b1, c1),
-        )
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
     }
     #[inline(always)]
-    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
     }
     #[inline(always)]
-    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_lo = self.zip_low_i8x32(a0, b0);
+        let lo_hi = self.zip_high_i8x32(a0, b0);
+        let hi_lo = self.zip_low_i8x32(a1, b1);
+        let hi_hi = self.zip_high_i8x32(a1, b1);
+        (
+            self.combine_i8x32(lo_lo, lo_hi),
+            self.combine_i8x32(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_even = self.unzip_low_i8x32(a0, a1);
+        let lo_odd = self.unzip_high_i8x32(a0, a1);
+        let hi_even = self.unzip_low_i8x32(b0, b1);
+        let hi_odd = self.unzip_high_i8x32(b0, b1);
+        (
+            self.combine_i8x32(lo_even, hi_even),
+            self.combine_i8x32(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let (c0, c1) = self.split_i8x64(c);
+        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
     }
     #[inline(always)]
-    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
-        mask64x8 {
-            val: crate::support::Aligned512(int64x2x4_t(
-                a.val.0.0, a.val.0.1, b.val.0.0, b.val.0.1,
-            )),
-            simd: self,
-        }
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
         (
-            mask64x2 {
-                val: crate::support::Aligned128(a.val.0.0),
+            i8x32 {
+                val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            mask64x2 {
-                val: crate::support::Aligned128(a.val.0.1),
+            i8x32 {
+                val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
-        let half = self.splat_f32x8(val);
-        self.combine_f32x8(half, half)
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i8x32(a0),
+            self.reinterpret_u32_i8x32(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
+        let half = self.splat_u8x32(val);
+        self.combine_u8x32(half, half)
     }
     #[inline(always)]
-    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
-        crate::transmute::checked_transmute_copy::<float32x4x4_t, [f32; 16usize]>(&a.val.0)
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
+        crate::transmute::checked_transmute_copy::<uint8x16x4_t, [u8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
-        crate::transmute::checked_cast_ref::<float32x4x4_t, [f32; 16usize]>(&a.val.0)
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
+        crate::transmute::checked_cast_ref::<uint8x16x4_t, [u8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
-        crate::transmute::checked_cast_mut::<float32x4x4_t, [f32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
+        crate::transmute::checked_cast_mut::<uint8x16x4_t, [u8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
-        f32x16 {
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        if SHIFT >= 64usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_f32x16(a).val.0;
-            let b_bytes = self.cvt_to_bytes_f32x16(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u8x64(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u8x64(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT * 4usize;
+            let shift_bytes = SHIFT;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7031,378 +9103,392 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_f32x16(u8x64 {
+        self.cvt_from_bytes_u8x64(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+    fn slide_within_blocks_u8x64<const SHIFT: usize>(
         self,
-        a: f32x16<Self>,
-        b: f32x16<Self>,
-    ) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(
+            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.approximate_recip_f32x8(a0),
-            self.approximate_recip_f32x8(a1),
-        )
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
     }
     #[inline(always)]
-    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
     }
     #[inline(always)]
-    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
     }
     #[inline(always)]
-    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, _) = self.split_f32x16(a);
-        let (b0, _) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (_, a1) = self.split_f32x16(a);
-        let (_, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, _) = self.split_u8x64(a);
+        let (b0, _) = self.split_u8x64(b);
+        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
     }
     #[inline(always)]
-    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (_, a1) = self.split_u8x64(a);
+        let (_, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_lo = self.zip_low_f32x8(a0, b0);
-        let lo_hi = self.zip_high_f32x8(a0, b0);
-        let hi_lo = self.zip_low_f32x8(a1, b1);
-        let hi_hi = self.zip_high_f32x8(a1, b1);
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_lo = self.zip_low_u8x32(a0, b0);
+        let lo_hi = self.zip_high_u8x32(a0, b0);
+        let hi_lo = self.zip_low_u8x32(a1, b1);
+        let hi_hi = self.zip_high_u8x32(a1, b1);
         (
-            self.combine_f32x8(lo_lo, lo_hi),
-            self.combine_f32x8(hi_lo, hi_hi),
+            self.combine_u8x32(lo_lo, lo_hi),
+            self.combine_u8x32(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_even = self.unzip_low_f32x8(a0, a1);
-        let lo_odd = self.unzip_high_f32x8(a0, a1);
-        let hi_even = self.unzip_low_f32x8(b0, b1);
-        let hi_odd = self.unzip_high_f32x8(b0, b1);
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_even = self.unzip_low_u8x32(a0, a1);
+        let lo_odd = self.unzip_high_u8x32(a0, a1);
+        let hi_even = self.unzip_low_u8x32(b0, b1);
+        let hi_odd = self.unzip_high_u8x32(b0, b1);
         (
-            self.combine_f32x8(lo_even, hi_even),
-            self.combine_f32x8(lo_odd, hi_odd),
+            self.combine_u8x32(lo_even, hi_even),
+            self.combine_u8x32(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let (c0, c1) = self.split_u8x64(c);
+        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.max_precise_f32x8(a0, b0),
-            self.max_precise_f32x8(a1, b1),
-        )
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.min_precise_f32x8(a0, b0),
-            self.min_precise_f32x8(a1, b1),
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        (
+            u8x32 {
+                val: crate::support::Aligned256(uint8x16x2_t(a.val.0.0, a.val.0.1)),
+                simd: self,
+            },
+            u8x32 {
+                val: crate::support::Aligned256(uint8x16x2_t(a.val.0.2, a.val.0.3)),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_add_f32x8(a0, b0, c0),
-            self.mul_add_f32x8(a1, b1, c1),
-        )
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
+        unsafe { vld4q_u8(src.as_ptr()).simd_into(self) }
     }
     #[inline(always)]
-    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_sub_f32x8(a0, b0, c0),
-            self.mul_sub_f32x8(a1, b1, c1),
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+        unsafe { vst4q_u8(dest.as_mut_ptr(), a.into()) }
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u8x32(a0),
+            self.reinterpret_u32_u8x32(a1),
         )
     }
     #[inline(always)]
-    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
+        let half = self.splat_mask8x32(val);
+        self.combine_mask8x32(half, half)
     }
     #[inline(always)]
-    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
+        mask8x64 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.round_ties_even_f32x8(a0),
-            self.round_ties_even_f32x8(a1),
-        )
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<int8x16x4_t, [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
     }
     #[inline(always)]
-    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
     }
     #[inline(always)]
-    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
     }
     #[inline(always)]
-    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        (
-            f32x8 {
-                val: crate::support::Aligned256(float32x4x2_t(a.val.0.0, a.val.0.1)),
-                simd: self,
-            },
-            f32x8 {
-                val: crate::support::Aligned256(float32x4x2_t(a.val.0.2, a.val.0.3)),
-                simd: self,
-            },
-        )
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f64x4(
-            self.reinterpret_f64_f32x8(a0),
-            self.reinterpret_f64_f32x8(a1),
-        )
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.reinterpret_i32_f32x8(a0),
-            self.reinterpret_i32_f32x8(a1),
-        )
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        unsafe { vld4q_f32(src.as_ptr()).simd_into(self) }
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
     }
     #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) }
+    fn select_mask8x64(
+        self,
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        let (c0, c1) = self.split_mask8x64(c);
+        self.combine_mask8x32(
+            self.select_mask8x32(a0, b0, c0),
+            self.select_mask8x32(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_f32x8(a0),
-            self.reinterpret_u32_f32x8(a1),
-        )
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.cvt_u32_precise_f32x8(a0),
-            self.cvt_u32_precise_f32x8(a1),
-        )
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.cvt_i32_precise_f32x8(a0),
-            self.cvt_i32_precise_f32x8(a1),
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+        (
+            mask8x32 {
+                val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)),
+                simd: self,
+            },
+            mask8x32 {
+                val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
-        let half = self.splat_i8x32(val);
-        self.combine_i8x32(half, half)
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
+        let half = self.splat_i16x16(val);
+        self.combine_i16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<int8x16x4_t, [i8; 64usize]>(&a.val.0)
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<int16x8x4_t, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
-        crate::transmute::checked_cast_ref::<int8x16x4_t, [i8; 64usize]>(&a.val.0)
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
+        crate::transmute::checked_cast_ref::<int16x8x4_t, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
-        crate::transmute::checked_cast_mut::<int8x16x4_t, [i8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
+        crate::transmute::checked_cast_mut::<int16x8x4_t, [i16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
-        i8x64 {
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
-    #[inline(always)]
-    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        if SHIFT >= 64usize {
+    #[inline(always)]
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        if SHIFT >= 32usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_i8x64(a).val.0;
-            let b_bytes = self.cvt_to_bytes_i8x64(b).val.0;
+            let a_bytes = self.cvt_to_bytes_i16x32(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i16x32(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT;
+            let shift_bytes = SHIFT * 2usize;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7442,277 +9528,286 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_i8x64(u8x64 {
+        self.cvt_from_bytes_i16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+    fn slide_within_blocks_i16x32<const SHIFT: usize>(
         self,
-        a: i8x64<Self>,
-        b: i8x64<Self>,
-    ) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(
-            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
     }
     #[inline(always)]
-    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, _) = self.split_i8x64(a);
-        let (b0, _) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, _) = self.split_i16x32(a);
+        let (b0, _) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (_, a1) = self.split_i8x64(a);
-        let (_, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (_, a1) = self.split_i16x32(a);
+        let (_, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.unzip_high_i16x16(a0, a1),
+            self.unzip_high_i16x16(b0, b1),
+        )
     }
     #[inline(always)]
-    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_lo = self.zip_low_i8x32(a0, b0);
-        let lo_hi = self.zip_high_i8x32(a0, b0);
-        let hi_lo = self.zip_low_i8x32(a1, b1);
-        let hi_hi = self.zip_high_i8x32(a1, b1);
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_lo = self.zip_low_i16x16(a0, b0);
+        let lo_hi = self.zip_high_i16x16(a0, b0);
+        let hi_lo = self.zip_low_i16x16(a1, b1);
+        let hi_hi = self.zip_high_i16x16(a1, b1);
         (
-            self.combine_i8x32(lo_lo, lo_hi),
-            self.combine_i8x32(hi_lo, hi_hi),
+            self.combine_i16x16(lo_lo, lo_hi),
+            self.combine_i16x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_even = self.unzip_low_i8x32(a0, a1);
-        let lo_odd = self.unzip_high_i8x32(a0, a1);
-        let hi_even = self.unzip_low_i8x32(b0, b1);
-        let hi_odd = self.unzip_high_i8x32(b0, b1);
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_even = self.unzip_low_i16x16(a0, a1);
+        let lo_odd = self.unzip_high_i16x16(a0, a1);
+        let hi_even = self.unzip_low_i16x16(b0, b1);
+        let hi_odd = self.unzip_high_i16x16(b0, b1);
         (
-            self.combine_i8x32(lo_even, hi_even),
-            self.combine_i8x32(lo_odd, hi_odd),
+            self.combine_i16x16(lo_even, hi_even),
+            self.combine_i16x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let (c0, c1) = self.split_i8x64(c);
-        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let (c0, c1) = self.split_i16x32(c);
+        self.combine_i16x16(
+            self.select_i16x16(a0, b0, c0),
+            self.select_i16x16(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        (
-            i8x32 {
-                val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)),
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        (
+            i16x16 {
+                val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            i8x32 {
-                val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)),
+            i16x16 {
+                val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_i16x16(a0),
+            self.reinterpret_u8_i16x16(a1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i8x64(a);
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i16x32(a);
         self.combine_u32x8(
-            self.reinterpret_u32_i8x32(a0),
-            self.reinterpret_u32_i8x32(a1),
+            self.reinterpret_u32_i16x16(a0),
+            self.reinterpret_u32_i16x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
-        let half = self.splat_u8x32(val);
-        self.combine_u8x32(half, half)
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
+        let half = self.splat_u16x16(val);
+        self.combine_u16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
-        crate::transmute::checked_transmute_copy::<uint8x16x4_t, [u8; 64usize]>(&a.val.0)
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
+        crate::transmute::checked_transmute_copy::<uint16x8x4_t, [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
-        crate::transmute::checked_cast_ref::<uint8x16x4_t, [u8; 64usize]>(&a.val.0)
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
+        crate::transmute::checked_cast_ref::<uint16x8x4_t, [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
-        crate::transmute::checked_cast_mut::<uint8x16x4_t, [u8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
+        crate::transmute::checked_cast_mut::<uint16x8x4_t, [u16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        if SHIFT >= 32usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_u8x64(a).val.0;
-            let b_bytes = self.cvt_to_bytes_u8x64(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u16x32(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u16x32(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT;
+            let shift_bytes = SHIFT * 2usize;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7752,392 +9847,414 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_u8x64(u8x64 {
+        self.cvt_from_bytes_u16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+    fn slide_within_blocks_u16x32<const SHIFT: usize>(
         self,
-        a: u8x64<Self>,
-        b: u8x64<Self>,
-    ) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(
-            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
     }
     #[inline(always)]
-    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
     }
     #[inline(always)]
-    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
     }
     #[inline(always)]
-    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, _) = self.split_u8x64(a);
-        let (b0, _) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (_, a1) = self.split_u8x64(a);
-        let (_, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, _) = self.split_u16x32(a);
+        let (b0, _) = self.split_u16x32(b);
+        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
     }
     #[inline(always)]
-    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (_, a1) = self.split_u16x32(a);
+        let (_, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_lo = self.zip_low_u8x32(a0, b0);
-        let lo_hi = self.zip_high_u8x32(a0, b0);
-        let hi_lo = self.zip_low_u8x32(a1, b1);
-        let hi_hi = self.zip_high_u8x32(a1, b1);
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.unzip_high_u16x16(a0, a1),
+            self.unzip_high_u16x16(b0, b1),
+        )
+    }
+    #[inline(always)]
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let lo_lo = self.zip_low_u16x16(a0, b0);
+        let lo_hi = self.zip_high_u16x16(a0, b0);
+        let hi_lo = self.zip_low_u16x16(a1, b1);
+        let hi_hi = self.zip_high_u16x16(a1, b1);
         (
-            self.combine_u8x32(lo_lo, lo_hi),
-            self.combine_u8x32(hi_lo, hi_hi),
+            self.combine_u16x16(lo_lo, lo_hi),
+            self.combine_u16x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_even = self.unzip_low_u8x32(a0, a1);
-        let lo_odd = self.unzip_high_u8x32(a0, a1);
-        let hi_even = self.unzip_low_u8x32(b0, b1);
-        let hi_odd = self.unzip_high_u8x32(b0, b1);
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let lo_even = self.unzip_low_u16x16(a0, a1);
+        let lo_odd = self.unzip_high_u16x16(a0, a1);
+        let hi_even = self.unzip_low_u16x16(b0, b1);
+        let hi_odd = self.unzip_high_u16x16(b0, b1);
         (
-            self.combine_u8x32(lo_even, hi_even),
-            self.combine_u8x32(lo_odd, hi_odd),
+            self.combine_u16x16(lo_even, hi_even),
+            self.combine_u16x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let (c0, c1) = self.split_u8x64(c);
-        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let (c0, c1) = self.split_u16x32(c);
+        self.combine_u16x16(
+            self.select_u16x16(a0, b0, c0),
+            self.select_u16x16(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
         (
-            u8x32 {
-                val: crate::support::Aligned256(uint8x16x2_t(a.val.0.0, a.val.0.1)),
+            u16x16 {
+                val: crate::support::Aligned256(uint16x8x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            u8x32 {
-                val: crate::support::Aligned256(uint8x16x2_t(a.val.0.2, a.val.0.3)),
+            u16x16 {
+                val: crate::support::Aligned256(uint16x8x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
-        unsafe { vld4q_u8(src.as_ptr()).simd_into(self) }
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+        unsafe { vld4q_u16(src.as_ptr()).simd_into(self) }
     }
     #[inline(always)]
-    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
-        unsafe { vst4q_u8(dest.as_mut_ptr(), a.into()) }
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        unsafe { vst4q_u16(dest.as_mut_ptr(), a.into()) }
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u8x64(a);
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_u16x16(a0),
+            self.reinterpret_u8_u16x16(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u16x32(a);
         self.combine_u32x8(
-            self.reinterpret_u32_u8x32(a0),
-            self.reinterpret_u32_u8x32(a1),
+            self.reinterpret_u32_u16x16(a0),
+            self.reinterpret_u32_u16x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
-        let half = self.splat_mask8x32(val);
-        self.combine_mask8x32(half, half)
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        let half = self.splat_mask16x16(val);
+        self.combine_mask16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
-        mask8x64 {
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+        mask16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<int8x16x4_t, [i8; 64usize]>(&a.val.0)
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<int16x8x4_t, [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
-        let lo = self.from_bitmask_mask8x32(bits);
-        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
-        self.combine_mask8x32(lo, hi)
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
     }
     #[inline(always)]
-    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x64(a);
-        let lo = self.to_bitmask_mask8x32(lo);
-        let hi = self.to_bitmask_mask8x32(hi);
-        lo | (hi << 32usize)
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
     }
     #[inline(always)]
-    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 64usize,
+            index < 32usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            64usize
+            32usize
         );
-        let mut lanes = self.as_array_mask8x64(*a);
+        let mut lanes = self.as_array_mask16x32(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x64(lanes);
+        *a = self.load_array_mask16x32(lanes);
     }
     #[inline(always)]
-    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
     }
     #[inline(always)]
-    fn select_mask8x64(
+    fn select_mask16x32(
         self,
-        a: mask8x64<Self>,
-        b: mask8x64<Self>,
-        c: mask8x64<Self>,
-    ) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        let (c0, c1) = self.split_mask8x64(c);
-        self.combine_mask8x32(
-            self.select_mask8x32(a0, b0, c0),
-            self.select_mask8x32(a1, b1, c1),
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        let (c0, c1) = self.split_mask16x32(c);
+        self.combine_mask16x16(
+            self.select_mask16x16(a0, b0, c0),
+            self.select_mask16x16(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(
+            self.simd_eq_mask16x16(a0, b0),
+            self.simd_eq_mask16x16(a1, b1),
+        )
     }
     #[inline(always)]
-    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
     }
     #[inline(always)]
-    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
     }
     #[inline(always)]
-    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
     }
     #[inline(always)]
-    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
     }
     #[inline(always)]
-    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
         (
-            mask8x32 {
-                val: crate::support::Aligned256(int8x16x2_t(a.val.0.0, a.val.0.1)),
+            mask16x16 {
+                val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            mask8x32 {
-                val: crate::support::Aligned256(int8x16x2_t(a.val.0.2, a.val.0.3)),
+            mask16x16 {
+                val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
-        let half = self.splat_i16x16(val);
-        self.combine_i16x16(half, half)
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+        let half = self.splat_i32x8(val);
+        self.combine_i32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<int16x8x4_t, [i16; 32usize]>(&a.val.0)
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<int32x4x4_t, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
-        crate::transmute::checked_cast_ref::<int16x8x4_t, [i16; 32usize]>(&a.val.0)
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        crate::transmute::checked_cast_ref::<int32x4x4_t, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
-        crate::transmute::checked_cast_mut::<int16x8x4_t, [i16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        crate::transmute::checked_cast_mut::<int32x4x4_t, [i32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_i16x32(a).val.0;
-            let b_bytes = self.cvt_to_bytes_i16x32(b).val.0;
+            let a_bytes = self.cvt_to_bytes_i32x16(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i32x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT * 2usize;
+            let shift_bytes = SHIFT * 4usize;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8177,286 +10294,282 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_i16x32(u8x64 {
+        self.cvt_from_bytes_i32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
         self,
-        a: i16x32<Self>,
-        b: i16x32<Self>,
-    ) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(
+            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
     }
     #[inline(always)]
-    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, _) = self.split_i16x32(a);
-        let (b0, _) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, _) = self.split_i32x16(a);
+        let (b0, _) = self.split_i32x16(b);
+        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (_, a1) = self.split_i16x32(a);
-        let (_, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (_, a1) = self.split_i32x16(a);
+        let (_, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.unzip_high_i16x16(a0, a1),
-            self.unzip_high_i16x16(b0, b1),
-        )
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_lo = self.zip_low_i16x16(a0, b0);
-        let lo_hi = self.zip_high_i16x16(a0, b0);
-        let hi_lo = self.zip_low_i16x16(a1, b1);
-        let hi_hi = self.zip_high_i16x16(a1, b1);
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let lo_lo = self.zip_low_i32x8(a0, b0);
+        let lo_hi = self.zip_high_i32x8(a0, b0);
+        let hi_lo = self.zip_low_i32x8(a1, b1);
+        let hi_hi = self.zip_high_i32x8(a1, b1);
         (
-            self.combine_i16x16(lo_lo, lo_hi),
-            self.combine_i16x16(hi_lo, hi_hi),
+            self.combine_i32x8(lo_lo, lo_hi),
+            self.combine_i32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_even = self.unzip_low_i16x16(a0, a1);
-        let lo_odd = self.unzip_high_i16x16(a0, a1);
-        let hi_even = self.unzip_low_i16x16(b0, b1);
-        let hi_odd = self.unzip_high_i16x16(b0, b1);
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let lo_even = self.unzip_low_i32x8(a0, a1);
+        let lo_odd = self.unzip_high_i32x8(a0, a1);
+        let hi_even = self.unzip_low_i32x8(b0, b1);
+        let hi_odd = self.unzip_high_i32x8(b0, b1);
         (
-            self.combine_i16x16(lo_even, hi_even),
-            self.combine_i16x16(lo_odd, hi_odd),
+            self.combine_i32x8(lo_even, hi_even),
+            self.combine_i32x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let (c0, c1) = self.split_i16x32(c);
-        self.combine_i16x16(
-            self.select_i16x16(a0, b0, c0),
-            self.select_i16x16(a1, b1, c1),
-        )
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let (c0, c1) = self.split_i32x16(c);
+        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
         (
-            i16x16 {
-                val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)),
+            i32x8 {
+                val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            i16x16 {
-                val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)),
+            i32x8 {
+                val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_i16x16(a0),
-            self.reinterpret_u8_i16x16(a1),
-        )
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i16x32(a);
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
         self.combine_u32x8(
-            self.reinterpret_u32_i16x16(a0),
-            self.reinterpret_u32_i16x16(a1),
+            self.reinterpret_u32_i32x8(a0),
+            self.reinterpret_u32_i32x8(a1),
         )
     }
     #[inline(always)]
-    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
-        let half = self.splat_u16x16(val);
-        self.combine_u16x16(half, half)
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
     }
     #[inline(always)]
-    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        let half = self.splat_u32x8(val);
+        self.combine_u32x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
-        crate::transmute::checked_transmute_copy::<uint16x8x4_t, [u16; 32usize]>(&a.val.0)
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        crate::transmute::checked_transmute_copy::<uint32x4x4_t, [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
-        crate::transmute::checked_cast_ref::<uint16x8x4_t, [u16; 32usize]>(&a.val.0)
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        crate::transmute::checked_cast_ref::<uint32x4x4_t, [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
-        crate::transmute::checked_cast_mut::<uint16x8x4_t, [u16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        crate::transmute::checked_cast_mut::<uint32x4x4_t, [u32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_u16x32(a).val.0;
-            let b_bytes = self.cvt_to_bytes_u16x32(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u32x16(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u32x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT * 2usize;
+            let shift_bytes = SHIFT * 4usize;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8496,414 +10609,394 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_u16x32(u8x64 {
+        self.cvt_from_bytes_u32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
         self,
-        a: u16x32<Self>,
-        b: u16x32<Self>,
-    ) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(
+            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
     }
     #[inline(always)]
-    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, _) = self.split_u16x32(a);
-        let (b0, _) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, _) = self.split_u32x16(a);
+        let (b0, _) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (_, a1) = self.split_u16x32(a);
-        let (_, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (_, a1) = self.split_u32x16(a);
+        let (_, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.unzip_high_u16x16(a0, a1),
-            self.unzip_high_u16x16(b0, b1),
-        )
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_lo = self.zip_low_u16x16(a0, b0);
-        let lo_hi = self.zip_high_u16x16(a0, b0);
-        let hi_lo = self.zip_low_u16x16(a1, b1);
-        let hi_hi = self.zip_high_u16x16(a1, b1);
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_lo = self.zip_low_u32x8(a0, b0);
+        let lo_hi = self.zip_high_u32x8(a0, b0);
+        let hi_lo = self.zip_low_u32x8(a1, b1);
+        let hi_hi = self.zip_high_u32x8(a1, b1);
         (
-            self.combine_u16x16(lo_lo, lo_hi),
-            self.combine_u16x16(hi_lo, hi_hi),
+            self.combine_u32x8(lo_lo, lo_hi),
+            self.combine_u32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_even = self.unzip_low_u16x16(a0, a1);
-        let lo_odd = self.unzip_high_u16x16(a0, a1);
-        let hi_even = self.unzip_low_u16x16(b0, b1);
-        let hi_odd = self.unzip_high_u16x16(b0, b1);
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_even = self.unzip_low_u32x8(a0, a1);
+        let lo_odd = self.unzip_high_u32x8(a0, a1);
+        let hi_even = self.unzip_low_u32x8(b0, b1);
+        let hi_odd = self.unzip_high_u32x8(b0, b1);
         (
-            self.combine_u16x16(lo_even, hi_even),
-            self.combine_u16x16(lo_odd, hi_odd),
+            self.combine_u32x8(lo_even, hi_even),
+            self.combine_u32x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let (c0, c1) = self.split_u16x32(c);
-        self.combine_u16x16(
-            self.select_u16x16(a0, b0, c0),
-            self.select_u16x16(a1, b1, c1),
-        )
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let (c0, c1) = self.split_u32x16(c);
+        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
         (
-            u16x16 {
-                val: crate::support::Aligned256(uint16x8x2_t(a.val.0.0, a.val.0.1)),
+            u32x8 {
+                val: crate::support::Aligned256(uint32x4x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            u16x16 {
-                val: crate::support::Aligned256(uint16x8x2_t(a.val.0.2, a.val.0.3)),
+            u32x8 {
+                val: crate::support::Aligned256(uint32x4x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
-        unsafe { vld4q_u16(src.as_ptr()).simd_into(self) }
-    }
-    #[inline(always)]
-    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
-        unsafe { vst4q_u16(dest.as_mut_ptr(), a.into()) }
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
+        unsafe { vld4q_u32(src.as_ptr()).simd_into(self) }
     }
     #[inline(always)]
-    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        unsafe { vst4q_u32(dest.as_mut_ptr(), a.into()) }
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_u16x16(a0),
-            self.reinterpret_u8_u16x16(a1),
-        )
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_u16x16(a0),
-            self.reinterpret_u32_u16x16(a1),
-        )
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
     }
     #[inline(always)]
-    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
-        let half = self.splat_mask16x16(val);
-        self.combine_mask16x16(half, half)
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        let half = self.splat_mask32x8(val);
+        self.combine_mask32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
-        mask16x32 {
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
+        mask32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<int16x8x4_t, [i16; 32usize]>(&a.val.0)
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<int32x4x4_t, [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
-        let lo = self.from_bitmask_mask16x16(bits);
-        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
-        self.combine_mask16x16(lo, hi)
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
     }
     #[inline(always)]
-    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x32(a);
-        let lo = self.to_bitmask_mask16x16(lo);
-        let hi = self.to_bitmask_mask16x16(hi);
-        lo | (hi << 16usize)
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
     }
     #[inline(always)]
-    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 32usize,
+            index < 16usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            32usize
+            16usize
         );
-        let mut lanes = self.as_array_mask16x32(*a);
+        let mut lanes = self.as_array_mask32x16(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x32(lanes);
+        *a = self.load_array_mask32x16(lanes);
     }
     #[inline(always)]
-    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
     }
     #[inline(always)]
-    fn select_mask16x32(
+    fn select_mask32x16(
         self,
-        a: mask16x32<Self>,
-        b: mask16x32<Self>,
-        c: mask16x32<Self>,
-    ) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        let (c0, c1) = self.split_mask16x32(c);
-        self.combine_mask16x16(
-            self.select_mask16x16(a0, b0, c0),
-            self.select_mask16x16(a1, b1, c1),
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        let (c0, c1) = self.split_mask32x16(c);
+        self.combine_mask32x8(
+            self.select_mask32x8(a0, b0, c0),
+            self.select_mask32x8(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(
-            self.simd_eq_mask16x16(a0, b0),
-            self.simd_eq_mask16x16(a1, b1),
-        )
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
         (
-            mask16x16 {
-                val: crate::support::Aligned256(int16x8x2_t(a.val.0.0, a.val.0.1)),
+            mask32x8 {
+                val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            mask16x16 {
-                val: crate::support::Aligned256(int16x8x2_t(a.val.0.2, a.val.0.3)),
+            mask32x8 {
+                val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
-        let half = self.splat_i32x8(val);
-        self.combine_i32x8(half, half)
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+        let half = self.splat_f64x4(val);
+        self.combine_f64x4(half, half)
     }
     #[inline(always)]
-    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<int32x4x4_t, [i32; 16usize]>(&a.val.0)
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        crate::transmute::checked_transmute_copy::<float64x2x4_t, [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
-        crate::transmute::checked_cast_ref::<int32x4x4_t, [i32; 16usize]>(&a.val.0)
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        crate::transmute::checked_cast_ref::<float64x2x4_t, [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
-        crate::transmute::checked_cast_mut::<int32x4x4_t, [i32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        crate::transmute::checked_cast_mut::<float64x2x4_t, [f64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
-        i32x16 {
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_i32x16(a).val.0;
-            let b_bytes = self.cvt_to_bytes_i32x16(b).val.0;
+            let a_bytes = self.cvt_to_bytes_f64x8(a).val.0;
+            let b_bytes = self.cvt_to_bytes_f64x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT * 4usize;
+            let shift_bytes = SHIFT * 8usize;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8943,282 +11036,323 @@ impl Simd for Neon {
                 },
             )
         };
-        self.cvt_from_bytes_i32x16(u8x64 {
+        self.cvt_from_bytes_f64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
         self,
-        a: i32x16<Self>,
-        b: i32x16<Self>,
-    ) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(
-            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
     }
     #[inline(always)]
-    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
     }
     #[inline(always)]
-    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
-    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.approximate_recip_f64x4(a0),
+            self.approximate_recip_f64x4(a1),
+        )
     }
     #[inline(always)]
-    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, _) = self.split_f64x8(a);
+        let (b0, _) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
     }
     #[inline(always)]
-    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (_, a1) = self.split_f64x8(a);
+        let (_, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, _) = self.split_i32x16(a);
-        let (b0, _) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_lo = self.zip_low_f64x4(a0, b0);
+        let lo_hi = self.zip_high_f64x4(a0, b0);
+        let hi_lo = self.zip_low_f64x4(a1, b1);
+        let hi_hi = self.zip_high_f64x4(a1, b1);
+        (
+            self.combine_f64x4(lo_lo, lo_hi),
+            self.combine_f64x4(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_even = self.unzip_low_f64x4(a0, a1);
+        let lo_odd = self.unzip_high_f64x4(a0, a1);
+        let hi_even = self.unzip_low_f64x4(b0, b1);
+        let hi_odd = self.unzip_high_f64x4(b0, b1);
+        (
+            self.combine_f64x4(lo_even, hi_even),
+            self.combine_f64x4(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.max_precise_f64x4(a0, b0),
+            self.max_precise_f64x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (_, a1) = self.split_i32x16(a);
-        let (_, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.min_precise_f64x4(a0, b0),
+            self.min_precise_f64x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_add_f64x4(a0, b0, c0),
+            self.mul_add_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_sub_f64x4(a0, b0, c0),
+            self.mul_sub_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_lo = self.zip_low_i32x8(a0, b0);
-        let lo_hi = self.zip_high_i32x8(a0, b0);
-        let hi_lo = self.zip_low_i32x8(a1, b1);
-        let hi_hi = self.zip_high_i32x8(a1, b1);
-        (
-            self.combine_i32x8(lo_lo, lo_hi),
-            self.combine_i32x8(hi_lo, hi_hi),
-        )
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
     }
     #[inline(always)]
-    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_even = self.unzip_low_i32x8(a0, a1);
-        let lo_odd = self.unzip_high_i32x8(a0, a1);
-        let hi_even = self.unzip_low_i32x8(b0, b1);
-        let hi_odd = self.unzip_high_i32x8(b0, b1);
-        (
-            self.combine_i32x8(lo_even, hi_even),
-            self.combine_i32x8(lo_odd, hi_odd),
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
+    }
+    #[inline(always)]
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.round_ties_even_f64x4(a0),
+            self.round_ties_even_f64x4(a1),
         )
     }
     #[inline(always)]
-    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let (c0, c1) = self.split_i32x16(c);
-        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
     }
     #[inline(always)]
-    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
     }
     #[inline(always)]
-    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
         (
-            i32x8 {
-                val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)),
+            f64x4 {
+                val: crate::support::Aligned256(float64x2x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            i32x8 {
-                val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)),
+            f64x4 {
+                val: crate::support::Aligned256(float64x2x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_i32x8(a0),
-            self.reinterpret_u32_i32x8(a1),
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f32x8(
+            self.reinterpret_f32_f64x4(a0),
+            self.reinterpret_f32_f64x4(a1),
         )
     }
     #[inline(always)]
-    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
-    }
-    #[inline(always)]
-    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
-        let half = self.splat_u32x8(val);
-        self.combine_u32x8(half, half)
+    fn splat_i64x8(self, val: i64) -> i64x8<Self> {
+        let half = self.splat_i64x4(val);
+        self.combine_i64x4(half, half)
     }
     #[inline(always)]
-    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
-        crate::transmute::checked_transmute_copy::<uint32x4x4_t, [u32; 16usize]>(&a.val.0)
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize] {
+        crate::transmute::checked_transmute_copy::<int64x2x4_t, [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
-        crate::transmute::checked_cast_ref::<uint32x4x4_t, [u32; 16usize]>(&a.val.0)
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize] {
+        crate::transmute::checked_cast_ref::<int64x2x4_t, [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
-        crate::transmute::checked_cast_mut::<uint32x4x4_t, [u32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize] {
+        crate::transmute::checked_cast_mut::<int64x2x4_t, [i64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
-        u32x16 {
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self> {
+        i64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_u32x16(a).val.0;
-            let b_bytes = self.cvt_to_bytes_u32x16(b).val.0;
+            let a_bytes = self.cvt_to_bytes_i64x8(a).val.0;
+            let b_bytes = self.cvt_to_bytes_i64x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
-            let shift_bytes = SHIFT * 4usize;
+            let shift_bytes = SHIFT * 8usize;
             uint8x16x4_t(
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9254,395 +11388,278 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-            )
-        };
-        self.cvt_from_bytes_u32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_u32x16<const SHIFT: usize>(
-        self,
-        a: u32x16<Self>,
-        b: u32x16<Self>,
-    ) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(
-            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
-    }
-    #[inline(always)]
-    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_i64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
+        self,
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(
+            self.slide_within_blocks_i64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x4::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, _) = self.split_u32x16(a);
-        let (b0, _) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (_, a1) = self.split_u32x16(a);
-        let (_, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_lo = self.zip_low_u32x8(a0, b0);
-        let lo_hi = self.zip_high_u32x8(a0, b0);
-        let hi_lo = self.zip_low_u32x8(a1, b1);
-        let hi_hi = self.zip_high_u32x8(a1, b1);
-        (
-            self.combine_u32x8(lo_lo, lo_hi),
-            self.combine_u32x8(hi_lo, hi_hi),
-        )
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_even = self.unzip_low_u32x8(a0, a1);
-        let lo_odd = self.unzip_high_u32x8(a0, a1);
-        let hi_even = self.unzip_low_u32x8(b0, b1);
-        let hi_odd = self.unzip_high_u32x8(b0, b1);
-        (
-            self.combine_u32x8(lo_even, hi_even),
-            self.combine_u32x8(lo_odd, hi_odd),
-        )
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1))
     }
     #[inline(always)]
-    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let (c0, c1) = self.split_u32x16(c);
-        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        (
-            u32x8 {
-                val: crate::support::Aligned256(uint32x4x2_t(a.val.0.0, a.val.0.1)),
-                simd: self,
-            },
-            u32x8 {
-                val: crate::support::Aligned256(uint32x4x2_t(a.val.0.2, a.val.0.3)),
-                simd: self,
-            },
-        )
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        unsafe { vld4q_u32(src.as_ptr()).simd_into(self) }
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        unsafe { vst4q_u32(dest.as_mut_ptr(), a.into()) }
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
-        let half = self.splat_mask32x8(val);
-        self.combine_mask32x8(half, half)
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, _) = self.split_i64x8(a);
+        let (b0, _) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0))
     }
     #[inline(always)]
-    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<int32x4x4_t, [i32; 16usize]>(&a.val.0)
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (_, a1) = self.split_i64x8(a);
+        let (_, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
-        let lo = self.from_bitmask_mask32x8(bits);
-        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
-        self.combine_mask32x8(lo, hi)
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x16(a);
-        let lo = self.to_bitmask_mask32x8(lo);
-        let hi = self.to_bitmask_mask32x8(hi);
-        lo | (hi << 8usize)
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask32x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x16(lanes);
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_lo = self.zip_low_i64x4(a0, b0);
+        let lo_hi = self.zip_high_i64x4(a0, b0);
+        let hi_lo = self.zip_low_i64x4(a1, b1);
+        let hi_hi = self.zip_high_i64x4(a1, b1);
+        (
+            self.combine_i64x4(lo_lo, lo_hi),
+            self.combine_i64x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_even = self.unzip_low_i64x4(a0, a1);
+        let lo_odd = self.unzip_high_i64x4(a0, a1);
+        let hi_even = self.unzip_low_i64x4(b0, b1);
+        let hi_odd = self.unzip_high_i64x4(b0, b1);
+        (
+            self.combine_i64x4(lo_even, hi_even),
+            self.combine_i64x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let (c0, c1) = self.split_i64x8(c);
+        self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_mask32x16(
-        self,
-        a: mask32x16<Self>,
-        b: mask32x16<Self>,
-        c: mask32x16<Self>,
-    ) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        let (c0, c1) = self.split_mask32x16(c);
-        self.combine_mask32x8(
-            self.select_mask32x8(a0, b0, c0),
-            self.select_mask32x8(a1, b1, c1),
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        (
+            i64x4 {
+                val: crate::support::Aligned256(int64x2x2_t(a.val.0.0, a.val.0.1)),
+                simd: self,
+            },
+            i64x4 {
+                val: crate::support::Aligned256(int64x2x2_t(a.val.0.2, a.val.0.3)),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
-    }
-    #[inline(always)]
-    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
-    }
-    #[inline(always)]
-    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1))
     }
     #[inline(always)]
-    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1))
     }
     #[inline(always)]
-    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
-        (
-            mask32x8 {
-                val: crate::support::Aligned256(int32x4x2_t(a.val.0.0, a.val.0.1)),
-                simd: self,
-            },
-            mask32x8 {
-                val: crate::support::Aligned256(int32x4x2_t(a.val.0.2, a.val.0.3)),
-                simd: self,
-            },
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i64x4(a0),
+            self.reinterpret_u32_i64x4(a1),
         )
     }
     #[inline(always)]
-    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
-        let half = self.splat_f64x4(val);
-        self.combine_f64x4(half, half)
+    fn splat_u64x8(self, val: u64) -> u64x8<Self> {
+        let half = self.splat_u64x4(val);
+        self.combine_u64x4(half, half)
     }
     #[inline(always)]
-    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
-        crate::transmute::checked_transmute_copy::<float64x2x4_t, [f64; 8usize]>(&a.val.0)
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize] {
+        crate::transmute::checked_transmute_copy::<uint64x2x4_t, [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
-        crate::transmute::checked_cast_ref::<float64x2x4_t, [f64; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize] {
+        crate::transmute::checked_cast_ref::<uint64x2x4_t, [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
-        crate::transmute::checked_cast_mut::<float64x2x4_t, [f64; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize] {
+        crate::transmute::checked_cast_mut::<uint64x2x4_t, [u64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = {
-            let a_bytes = self.cvt_to_bytes_f64x8(a).val.0;
-            let b_bytes = self.cvt_to_bytes_f64x8(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u64x8(a).val.0;
+            let b_bytes = self.cvt_to_bytes_u64x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
             let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3];
             let shift_bytes = SHIFT * 8usize;
@@ -9670,276 +11687,233 @@ impl Simd for Neon {
                         &a_blocks,
                         &b_blocks,
                         2,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-                {
-                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
-                        &a_blocks,
-                        &b_blocks,
-                        3,
-                        shift_bytes,
-                    );
-                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
-                },
-            )
-        };
-        self.cvt_from_bytes_f64x8(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_f64x8<const SHIFT: usize>(
-        self,
-        a: f64x8<Self>,
-        b: f64x8<Self>,
-    ) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
-    }
-    #[inline(always)]
-    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
-    }
-    #[inline(always)]
-    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+                {
+                    let [lo, hi] = crate::support::cross_block_slide_blocks_at(
+                        &a_blocks,
+                        &b_blocks,
+                        3,
+                        shift_bytes,
+                    );
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
+                },
+            )
+        };
+        self.cvt_from_bytes_u64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.approximate_recip_f64x4(a0),
-            self.approximate_recip_f64x4(a1),
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
+        self,
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(
+            self.slide_within_blocks_u64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1))
     }
     #[inline(always)]
-    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, _) = self.split_f64x8(a);
-        let (b0, _) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (_, a1) = self.split_f64x8(a);
-        let (_, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_lo = self.zip_low_f64x4(a0, b0);
-        let lo_hi = self.zip_high_f64x4(a0, b0);
-        let hi_lo = self.zip_low_f64x4(a1, b1);
-        let hi_hi = self.zip_high_f64x4(a1, b1);
-        (
-            self.combine_f64x4(lo_lo, lo_hi),
-            self.combine_f64x4(hi_lo, hi_hi),
-        )
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_even = self.unzip_low_f64x4(a0, a1);
-        let lo_odd = self.unzip_high_f64x4(a0, a1);
-        let hi_even = self.unzip_low_f64x4(b0, b1);
-        let hi_odd = self.unzip_high_f64x4(b0, b1);
-        (
-            self.combine_f64x4(lo_even, hi_even),
-            self.combine_f64x4(lo_odd, hi_odd),
-        )
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, _) = self.split_u64x8(a);
+        let (b0, _) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0))
     }
     #[inline(always)]
-    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.max_precise_f64x4(a0, b0),
-            self.max_precise_f64x4(a1, b1),
-        )
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (_, a1) = self.split_u64x8(a);
+        let (_, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.min_precise_f64x4(a0, b0),
-            self.min_precise_f64x4(a1, b1),
-        )
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_add_f64x4(a0, b0, c0),
-            self.mul_add_f64x4(a1, b1, c1),
-        )
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_sub_f64x4(a0, b0, c0),
-            self.mul_sub_f64x4(a1, b1, c1),
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_lo = self.zip_low_u64x4(a0, b0);
+        let lo_hi = self.zip_high_u64x4(a0, b0);
+        let hi_lo = self.zip_low_u64x4(a1, b1);
+        let hi_hi = self.zip_high_u64x4(a1, b1);
+        (
+            self.combine_u64x4(lo_lo, lo_hi),
+            self.combine_u64x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
-    }
-    #[inline(always)]
-    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
-    }
-    #[inline(always)]
-    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.round_ties_even_f64x4(a0),
-            self.round_ties_even_f64x4(a1),
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_even = self.unzip_low_u64x4(a0, a1);
+        let lo_odd = self.unzip_high_u64x4(a0, a1);
+        let hi_even = self.unzip_low_u64x4(b0, b1);
+        let hi_odd = self.unzip_high_u64x4(b0, b1);
+        (
+            self.combine_u64x4(lo_even, hi_even),
+            self.combine_u64x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let (c0, c1) = self.split_u64x8(c);
+        self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_mask64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>) {
         (
-            f64x4 {
-                val: crate::support::Aligned256(float64x2x2_t(a.val.0.0, a.val.0.1)),
+            u64x4 {
+                val: crate::support::Aligned256(uint64x2x2_t(a.val.0.0, a.val.0.1)),
                 simd: self,
             },
-            f64x4 {
-                val: crate::support::Aligned256(float64x2x2_t(a.val.0.2, a.val.0.3)),
+            u64x4 {
+                val: crate::support::Aligned256(uint64x2x2_t(a.val.0.2, a.val.0.3)),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f32x8(
-            self.reinterpret_f32_f64x4(a0),
-            self.reinterpret_f32_f64x4(a1),
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> {
+        unsafe { vld4q_u64(src.as_ptr()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
+        unsafe { vst4q_u64(dest.as_mut_ptr(), a.into()) }
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u64x4(a0),
+            self.reinterpret_u32_u64x4(a1),
         )
     }
     #[inline(always)]
@@ -10225,6 +12199,36 @@ impl<S: Simd> From<f64x2<S>> for float64x2_t {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<int64x2_t, S> for i64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: int64x2_t) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x2<S>> for int64x2_t {
+    #[inline(always)]
+    fn from(value: i64x2<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<uint64x2_t, S> for u64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: uint64x2_t) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x2<S>> for uint64x2_t {
+    #[inline(always)]
+    fn from(value: u64x2<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
 impl<S: Simd> SimdFrom<int64x2_t, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int64x2_t) -> Self {
@@ -10405,6 +12409,36 @@ impl<S: Simd> From<f64x4<S>> for float64x2x2_t {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<int64x2x2_t, S> for i64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: int64x2x2_t) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x4<S>> for int64x2x2_t {
+    #[inline(always)]
+    fn from(value: i64x4<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<uint64x2x2_t, S> for u64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: uint64x2x2_t) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x4<S>> for uint64x2x2_t {
+    #[inline(always)]
+    fn from(value: u64x4<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
 impl<S: Simd> SimdFrom<int64x2x2_t, S> for mask64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int64x2x2_t) -> Self {
@@ -10585,6 +12619,36 @@ impl<S: Simd> From<f64x8<S>> for float64x2x4_t {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<int64x2x4_t, S> for i64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: int64x2x4_t) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x8<S>> for int64x2x4_t {
+    #[inline(always)]
+    fn from(value: i64x8<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<uint64x2x4_t, S> for u64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: uint64x2x4_t) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x8<S>> for uint64x2x4_t {
+    #[inline(always)]
+    fn from(value: u64x8<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
 impl<S: Simd> SimdFrom<int64x2x4_t, S> for mask64x8<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: int64x2x4_t) -> Self {
diff --git a/fearless_simd/src/generated/ops.rs b/fearless_simd/src/generated/ops.rs
index b05d99186..53c6f9e92 100644
--- a/fearless_simd/src/generated/ops.rs
+++ b/fearless_simd/src/generated/ops.rs
@@ -6,9 +6,9 @@
 use crate::{Simd, SimdInto};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 impl<S: Simd> core::ops::Neg for f32x4<S> {
     type Output = Self;
@@ -2145,4380 +2145,6060 @@ impl<S: Simd> core::ops::Div<f64x2<S>> for f64 {
         rhs.simd.div_f64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for mask64x2<S> {
+impl<S: Simd> core::ops::Neg for i64x2<S> {
     type Output = Self;
-    #[doc = "Compute the logical AND of two masks."]
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask64x2(self, rhs)
+    fn neg(self) -> Self::Output {
+        self.simd.neg_i64x2(self)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for mask64x2<S> {
-    #[doc = "Compute the logical AND of two masks."]
+impl<S: Simd> core::ops::Add for i64x2<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask64x2(*self, rhs);
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for mask64x2<S> {
+impl<S: Simd> core::ops::AddAssign for i64x2<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_i64x2(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add<i64> for i64x2<S> {
     type Output = Self;
-    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask64x2(self, rhs)
+    fn add(self, rhs: i64) -> Self::Output {
+        self.simd.add_i64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for mask64x2<S> {
-    #[doc = "Compute the logical OR of two masks."]
+impl<S: Simd> core::ops::AddAssign<i64> for i64x2<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask64x2(*self, rhs);
+    fn add_assign(&mut self, rhs: i64) {
+        *self = self.simd.add_i64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor for mask64x2<S> {
+impl<S: Simd> core::ops::Add<i64x2<S>> for i64 {
+    type Output = i64x2<S>;
+    #[inline(always)]
+    fn add(self, rhs: i64x2<S>) -> Self::Output {
+        rhs.simd.add_i64x2(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for i64x2<S> {
     type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask64x2(self, rhs)
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for mask64x2<S> {
-    #[doc = "Compute the logical XOR of two masks."]
+impl<S: Simd> core::ops::SubAssign for i64x2<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask64x2(*self, rhs);
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_i64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Not for mask64x2<S> {
+impl<S: Simd> core::ops::Sub<i64> for i64x2<S> {
     type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask64x2(self)
+    fn sub(self, rhs: i64) -> Self::Output {
+        self.simd.sub_i64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Neg for f32x8<S> {
+impl<S: Simd> core::ops::SubAssign<i64> for i64x2<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: i64) {
+        *self = self.simd.sub_i64x2(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Sub<i64x2<S>> for i64 {
+    type Output = i64x2<S>;
+    #[inline(always)]
+    fn sub(self, rhs: i64x2<S>) -> Self::Output {
+        rhs.simd.sub_i64x2(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for i64x2<S> {
     type Output = Self;
-    #[doc = "Negate each element of the vector."]
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_f32x8(self)
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Add for f32x8<S> {
+impl<S: Simd> core::ops::MulAssign for i64x2<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_i64x2(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Mul<i64> for i64x2<S> {
     type Output = Self;
-    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_f32x8(self, rhs)
+    fn mul(self, rhs: i64) -> Self::Output {
+        self.simd.mul_i64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign for f32x8<S> {
-    #[doc = "Add two vectors element-wise."]
+impl<S: Simd> core::ops::MulAssign<i64> for i64x2<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_f32x8(*self, rhs);
+    fn mul_assign(&mut self, rhs: i64) {
+        *self = self.simd.mul_i64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<f32> for f32x8<S> {
+impl<S: Simd> core::ops::Mul<i64x2<S>> for i64 {
+    type Output = i64x2<S>;
+    #[inline(always)]
+    fn mul(self, rhs: i64x2<S>) -> Self::Output {
+        rhs.simd.mul_i64x2(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for i64x2<S> {
     type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
-    fn add(self, rhs: f32) -> Self::Output {
-        self.simd.add_f32x8(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign<f32> for f32x8<S> {
+impl<S: Simd> core::ops::BitAndAssign for i64x2<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
-    fn add_assign(&mut self, rhs: f32) {
-        *self = self.simd.add_f32x8(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_i64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<f32x8<S>> for f32 {
-    type Output = f32x8<S>;
+impl<S: Simd> core::ops::BitAnd<i64> for i64x2<S> {
+    type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: f32x8<S>) -> Self::Output {
-        rhs.simd.add_f32x8(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: i64) -> Self::Output {
+        self.simd.and_i64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Sub for f32x8<S> {
+impl<S: Simd> core::ops::BitAndAssign<i64> for i64x2<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: i64) {
+        *self = self.simd.and_i64x2(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i64x2<S>> for i64 {
+    type Output = i64x2<S>;
+    #[inline(always)]
+    fn bitand(self, rhs: i64x2<S>) -> Self::Output {
+        rhs.simd.and_i64x2(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for i64x2<S> {
     type Output = Self;
-    #[doc = "Subtract two vectors element-wise."]
+    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_f32x8(self, rhs)
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for f32x8<S> {
-    #[doc = "Subtract two vectors element-wise."]
+impl<S: Simd> core::ops::BitOrAssign for i64x2<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_f32x8(*self, rhs);
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_i64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<f32> for f32x8<S> {
+impl<S: Simd> core::ops::BitOr<i64> for i64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: f32) -> Self::Output {
-        self.simd.sub_f32x8(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: i64) -> Self::Output {
+        self.simd.or_i64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<f32> for f32x8<S> {
+impl<S: Simd> core::ops::BitOrAssign<i64> for i64x2<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: f32) {
-        *self = self.simd.sub_f32x8(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: i64) {
+        *self = self.simd.or_i64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<f32x8<S>> for f32 {
-    type Output = f32x8<S>;
+impl<S: Simd> core::ops::BitOr<i64x2<S>> for i64 {
+    type Output = i64x2<S>;
     #[inline(always)]
-    fn sub(self, rhs: f32x8<S>) -> Self::Output {
-        rhs.simd.sub_f32x8(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: i64x2<S>) -> Self::Output {
+        rhs.simd.or_i64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for f32x8<S> {
+impl<S: Simd> core::ops::BitXor for i64x2<S> {
     type Output = Self;
-    #[doc = "Multiply two vectors element-wise."]
+    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_f32x8(self, rhs)
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for f32x8<S> {
-    #[doc = "Multiply two vectors element-wise."]
+impl<S: Simd> core::ops::BitXorAssign for i64x2<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_f32x8(*self, rhs);
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_i64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<f32> for f32x8<S> {
+impl<S: Simd> core::ops::BitXor<i64> for i64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: f32) -> Self::Output {
-        self.simd.mul_f32x8(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: i64) -> Self::Output {
+        self.simd.xor_i64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<f32> for f32x8<S> {
+impl<S: Simd> core::ops::BitXorAssign<i64> for i64x2<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: f32) {
-        *self = self.simd.mul_f32x8(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: i64) {
+        *self = self.simd.xor_i64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<f32x8<S>> for f32 {
-    type Output = f32x8<S>;
+impl<S: Simd> core::ops::BitXor<i64x2<S>> for i64 {
+    type Output = i64x2<S>;
     #[inline(always)]
-    fn mul(self, rhs: f32x8<S>) -> Self::Output {
-        rhs.simd.mul_f32x8(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: i64x2<S>) -> Self::Output {
+        rhs.simd.xor_i64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Div for f32x8<S> {
+impl<S: Simd> core::ops::Not for i64x2<S> {
     type Output = Self;
-    #[doc = "Divide two vectors element-wise."]
+    #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
-    fn div(self, rhs: Self) -> Self::Output {
-        self.simd.div_f32x8(self, rhs)
+    fn not(self) -> Self::Output {
+        self.simd.not_i64x2(self)
     }
 }
-impl<S: Simd> core::ops::DivAssign for f32x8<S> {
-    #[doc = "Divide two vectors element-wise."]
+impl<S: Simd> core::ops::Shl<u32> for i64x2<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: Self) {
-        *self = self.simd.div_f32x8(*self, rhs);
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Div<f32> for f32x8<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for i64x2<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_i64x2(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shl for i64x2<S> {
     type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn div(self, rhs: f32) -> Self::Output {
-        self.simd.div_f32x8(self, rhs.simd_into(self.simd))
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::DivAssign<f32> for f32x8<S> {
+impl<S: Simd> core::ops::ShlAssign for i64x2<S> {
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: f32) {
-        *self = self.simd.div_f32x8(*self, rhs.simd_into(self.simd));
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_i64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Div<f32x8<S>> for f32 {
-    type Output = f32x8<S>;
+impl<S: Simd> core::ops::Shr<u32> for i64x2<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
-    fn div(self, rhs: f32x8<S>) -> Self::Output {
-        rhs.simd.div_f32x8(self.simd_into(rhs.simd), rhs)
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Neg for i8x32<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for i64x2<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_i64x2(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr for i64x2<S> {
     type Output = Self;
-    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_i8x32(self)
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_i64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Add for i8x32<S> {
+impl<S: Simd> core::ops::ShrAssign for i64x2<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_i64x2(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add for u64x2<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_i8x32(self, rhs)
+        self.simd.add_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for i8x32<S> {
+impl<S: Simd> core::ops::AddAssign for u64x2<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_i8x32(*self, rhs);
+        *self = self.simd.add_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<i8> for i8x32<S> {
+impl<S: Simd> core::ops::Add<u64> for u64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: i8) -> Self::Output {
-        self.simd.add_i8x32(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: u64) -> Self::Output {
+        self.simd.add_u64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<i8> for i8x32<S> {
+impl<S: Simd> core::ops::AddAssign<u64> for u64x2<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: i8) {
-        *self = self.simd.add_i8x32(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: u64) {
+        *self = self.simd.add_u64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<i8x32<S>> for i8 {
-    type Output = i8x32<S>;
+impl<S: Simd> core::ops::Add<u64x2<S>> for u64 {
+    type Output = u64x2<S>;
     #[inline(always)]
-    fn add(self, rhs: i8x32<S>) -> Self::Output {
-        rhs.simd.add_i8x32(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: u64x2<S>) -> Self::Output {
+        rhs.simd.add_u64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for i8x32<S> {
+impl<S: Simd> core::ops::Sub for u64x2<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_i8x32(self, rhs)
+        self.simd.sub_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for i8x32<S> {
+impl<S: Simd> core::ops::SubAssign for u64x2<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_i8x32(*self, rhs);
+        *self = self.simd.sub_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<i8> for i8x32<S> {
+impl<S: Simd> core::ops::Sub<u64> for u64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: i8) -> Self::Output {
-        self.simd.sub_i8x32(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: u64) -> Self::Output {
+        self.simd.sub_u64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<i8> for i8x32<S> {
+impl<S: Simd> core::ops::SubAssign<u64> for u64x2<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: i8) {
-        *self = self.simd.sub_i8x32(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: u64) {
+        *self = self.simd.sub_u64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<i8x32<S>> for i8 {
-    type Output = i8x32<S>;
+impl<S: Simd> core::ops::Sub<u64x2<S>> for u64 {
+    type Output = u64x2<S>;
     #[inline(always)]
-    fn sub(self, rhs: i8x32<S>) -> Self::Output {
-        rhs.simd.sub_i8x32(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: u64x2<S>) -> Self::Output {
+        rhs.simd.sub_u64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for i8x32<S> {
+impl<S: Simd> core::ops::Mul for u64x2<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_i8x32(self, rhs)
+        self.simd.mul_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for i8x32<S> {
+impl<S: Simd> core::ops::MulAssign for u64x2<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_i8x32(*self, rhs);
+        *self = self.simd.mul_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i8> for i8x32<S> {
+impl<S: Simd> core::ops::Mul<u64> for u64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: i8) -> Self::Output {
-        self.simd.mul_i8x32(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: u64) -> Self::Output {
+        self.simd.mul_u64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<i8> for i8x32<S> {
+impl<S: Simd> core::ops::MulAssign<u64> for u64x2<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: i8) {
-        *self = self.simd.mul_i8x32(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: u64) {
+        *self = self.simd.mul_u64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<i8x32<S>> for i8 {
-    type Output = i8x32<S>;
+impl<S: Simd> core::ops::Mul<u64x2<S>> for u64 {
+    type Output = u64x2<S>;
     #[inline(always)]
-    fn mul(self, rhs: i8x32<S>) -> Self::Output {
-        rhs.simd.mul_i8x32(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: u64x2<S>) -> Self::Output {
+        rhs.simd.mul_u64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for i8x32<S> {
+impl<S: Simd> core::ops::BitAnd for u64x2<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_i8x32(self, rhs)
+        self.simd.and_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for i8x32<S> {
+impl<S: Simd> core::ops::BitAndAssign for u64x2<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_i8x32(*self, rhs);
+        *self = self.simd.and_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<i8> for i8x32<S> {
+impl<S: Simd> core::ops::BitAnd<u64> for u64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: i8) -> Self::Output {
-        self.simd.and_i8x32(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: u64) -> Self::Output {
+        self.simd.and_u64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<i8> for i8x32<S> {
+impl<S: Simd> core::ops::BitAndAssign<u64> for u64x2<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: i8) {
-        *self = self.simd.and_i8x32(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: u64) {
+        *self = self.simd.and_u64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<i8x32<S>> for i8 {
-    type Output = i8x32<S>;
+impl<S: Simd> core::ops::BitAnd<u64x2<S>> for u64 {
+    type Output = u64x2<S>;
     #[inline(always)]
-    fn bitand(self, rhs: i8x32<S>) -> Self::Output {
-        rhs.simd.and_i8x32(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: u64x2<S>) -> Self::Output {
+        rhs.simd.and_u64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for i8x32<S> {
+impl<S: Simd> core::ops::BitOr for u64x2<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_i8x32(self, rhs)
+        self.simd.or_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for i8x32<S> {
+impl<S: Simd> core::ops::BitOrAssign for u64x2<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_i8x32(*self, rhs);
+        *self = self.simd.or_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<i8> for i8x32<S> {
+impl<S: Simd> core::ops::BitOr<u64> for u64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: i8) -> Self::Output {
-        self.simd.or_i8x32(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: u64) -> Self::Output {
+        self.simd.or_u64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<i8> for i8x32<S> {
+impl<S: Simd> core::ops::BitOrAssign<u64> for u64x2<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: i8) {
-        *self = self.simd.or_i8x32(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: u64) {
+        *self = self.simd.or_u64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<i8x32<S>> for i8 {
-    type Output = i8x32<S>;
+impl<S: Simd> core::ops::BitOr<u64x2<S>> for u64 {
+    type Output = u64x2<S>;
     #[inline(always)]
-    fn bitor(self, rhs: i8x32<S>) -> Self::Output {
-        rhs.simd.or_i8x32(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: u64x2<S>) -> Self::Output {
+        rhs.simd.or_u64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for i8x32<S> {
+impl<S: Simd> core::ops::BitXor for u64x2<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_i8x32(self, rhs)
+        self.simd.xor_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for i8x32<S> {
+impl<S: Simd> core::ops::BitXorAssign for u64x2<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_i8x32(*self, rhs);
+        *self = self.simd.xor_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<i8> for i8x32<S> {
+impl<S: Simd> core::ops::BitXor<u64> for u64x2<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: i8) -> Self::Output {
-        self.simd.xor_i8x32(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: u64) -> Self::Output {
+        self.simd.xor_u64x2(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<i8> for i8x32<S> {
+impl<S: Simd> core::ops::BitXorAssign<u64> for u64x2<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: i8) {
-        *self = self.simd.xor_i8x32(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: u64) {
+        *self = self.simd.xor_u64x2(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<i8x32<S>> for i8 {
-    type Output = i8x32<S>;
+impl<S: Simd> core::ops::BitXor<u64x2<S>> for u64 {
+    type Output = u64x2<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: i8x32<S>) -> Self::Output {
-        rhs.simd.xor_i8x32(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: u64x2<S>) -> Self::Output {
+        rhs.simd.xor_u64x2(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for i8x32<S> {
+impl<S: Simd> core::ops::Not for u64x2<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_i8x32(self)
+        self.simd.not_u64x2(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for i8x32<S> {
+impl<S: Simd> core::ops::Shl<u32> for u64x2<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_i8x32(self, rhs)
+        self.simd.shl_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for i8x32<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for u64x2<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_i8x32(*self, rhs);
+        *self = self.simd.shl_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for i8x32<S> {
+impl<S: Simd> core::ops::Shl for u64x2<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_i8x32(self, rhs)
+        self.simd.shlv_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for i8x32<S> {
+impl<S: Simd> core::ops::ShlAssign for u64x2<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_i8x32(*self, rhs);
+        *self = self.simd.shlv_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for i8x32<S> {
+impl<S: Simd> core::ops::Shr<u32> for u64x2<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_i8x32(self, rhs)
+        self.simd.shr_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for i8x32<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for u64x2<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_i8x32(*self, rhs);
+        *self = self.simd.shr_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for i8x32<S> {
+impl<S: Simd> core::ops::Shr for u64x2<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_i8x32(self, rhs)
+        self.simd.shrv_u64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for i8x32<S> {
+impl<S: Simd> core::ops::ShrAssign for u64x2<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_i8x32(*self, rhs);
+        *self = self.simd.shrv_u64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add for u8x32<S> {
+impl<S: Simd> core::ops::BitAnd for mask64x2<S> {
     type Output = Self;
-    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
-    fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_u8x32(self, rhs)
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_mask64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for u8x32<S> {
-    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+impl<S: Simd> core::ops::BitAndAssign for mask64x2<S> {
+    #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_u8x32(*self, rhs);
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_mask64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<u8> for u8x32<S> {
+impl<S: Simd> core::ops::BitOr for mask64x2<S> {
     type Output = Self;
+    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn add(self, rhs: u8) -> Self::Output {
-        self.simd.add_u8x32(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::AddAssign<u8> for u8x32<S> {
-    #[inline(always)]
-    fn add_assign(&mut self, rhs: u8) {
-        *self = self.simd.add_u8x32(*self, rhs.simd_into(self.simd));
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_mask64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Add<u8x32<S>> for u8 {
-    type Output = u8x32<S>;
+impl<S: Simd> core::ops::BitOrAssign for mask64x2<S> {
+    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn add(self, rhs: u8x32<S>) -> Self::Output {
-        rhs.simd.add_u8x32(self.simd_into(rhs.simd), rhs)
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_mask64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub for u8x32<S> {
+impl<S: Simd> core::ops::BitXor for mask64x2<S> {
     type Output = Self;
-    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_u8x32(self, rhs)
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask64x2(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for u8x32<S> {
-    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+impl<S: Simd> core::ops::BitXorAssign for mask64x2<S> {
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_u8x32(*self, rhs);
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_mask64x2(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<u8> for u8x32<S> {
+impl<S: Simd> core::ops::Not for mask64x2<S> {
     type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn sub(self, rhs: u8) -> Self::Output {
-        self.simd.sub_u8x32(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::SubAssign<u8> for u8x32<S> {
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: u8) {
-        *self = self.simd.sub_u8x32(*self, rhs.simd_into(self.simd));
+    fn not(self) -> Self::Output {
+        self.simd.not_mask64x2(self)
     }
 }
-impl<S: Simd> core::ops::Sub<u8x32<S>> for u8 {
-    type Output = u8x32<S>;
+impl<S: Simd> core::ops::Neg for f32x8<S> {
+    type Output = Self;
+    #[doc = "Negate each element of the vector."]
     #[inline(always)]
-    fn sub(self, rhs: u8x32<S>) -> Self::Output {
-        rhs.simd.sub_u8x32(self.simd_into(rhs.simd), rhs)
+    fn neg(self) -> Self::Output {
+        self.simd.neg_f32x8(self)
     }
 }
-impl<S: Simd> core::ops::Mul for u8x32<S> {
+impl<S: Simd> core::ops::Add for f32x8<S> {
     type Output = Self;
-    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_u8x32(self, rhs)
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_f32x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for u8x32<S> {
-    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+impl<S: Simd> core::ops::AddAssign for f32x8<S> {
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_u8x32(*self, rhs);
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_f32x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<u8> for u8x32<S> {
+impl<S: Simd> core::ops::Add<f32> for f32x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: u8) -> Self::Output {
-        self.simd.mul_u8x32(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: f32) -> Self::Output {
+        self.simd.add_f32x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<u8> for u8x32<S> {
+impl<S: Simd> core::ops::AddAssign<f32> for f32x8<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: u8) {
-        *self = self.simd.mul_u8x32(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: f32) {
+        *self = self.simd.add_f32x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<u8x32<S>> for u8 {
-    type Output = u8x32<S>;
+impl<S: Simd> core::ops::Add<f32x8<S>> for f32 {
+    type Output = f32x8<S>;
     #[inline(always)]
-    fn mul(self, rhs: u8x32<S>) -> Self::Output {
-        rhs.simd.mul_u8x32(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: f32x8<S>) -> Self::Output {
+        rhs.simd.add_f32x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for u8x32<S> {
+impl<S: Simd> core::ops::Sub for f32x8<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise AND of two vectors."]
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_u8x32(self, rhs)
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_f32x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for u8x32<S> {
-    #[doc = "Compute the bitwise AND of two vectors."]
+impl<S: Simd> core::ops::SubAssign for f32x8<S> {
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_u8x32(*self, rhs);
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_f32x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<u8> for u8x32<S> {
+impl<S: Simd> core::ops::Sub<f32> for f32x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: u8) -> Self::Output {
-        self.simd.and_u8x32(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: f32) -> Self::Output {
+        self.simd.sub_f32x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<u8> for u8x32<S> {
+impl<S: Simd> core::ops::SubAssign<f32> for f32x8<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: u8) {
-        *self = self.simd.and_u8x32(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: f32) {
+        *self = self.simd.sub_f32x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<u8x32<S>> for u8 {
-    type Output = u8x32<S>;
+impl<S: Simd> core::ops::Sub<f32x8<S>> for f32 {
+    type Output = f32x8<S>;
     #[inline(always)]
-    fn bitand(self, rhs: u8x32<S>) -> Self::Output {
-        rhs.simd.and_u8x32(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: f32x8<S>) -> Self::Output {
+        rhs.simd.sub_f32x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for u8x32<S> {
+impl<S: Simd> core::ops::Mul for f32x8<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise OR of two vectors."]
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_u8x32(self, rhs)
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_f32x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for u8x32<S> {
-    #[doc = "Compute the bitwise OR of two vectors."]
+impl<S: Simd> core::ops::MulAssign for f32x8<S> {
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_u8x32(*self, rhs);
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_f32x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<u8> for u8x32<S> {
+impl<S: Simd> core::ops::Mul<f32> for f32x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: u8) -> Self::Output {
-        self.simd.or_u8x32(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: f32) -> Self::Output {
+        self.simd.mul_f32x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<u8> for u8x32<S> {
+impl<S: Simd> core::ops::MulAssign<f32> for f32x8<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: u8) {
-        *self = self.simd.or_u8x32(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: f32) {
+        *self = self.simd.mul_f32x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<u8x32<S>> for u8 {
-    type Output = u8x32<S>;
+impl<S: Simd> core::ops::Mul<f32x8<S>> for f32 {
+    type Output = f32x8<S>;
     #[inline(always)]
-    fn bitor(self, rhs: u8x32<S>) -> Self::Output {
-        rhs.simd.or_u8x32(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: f32x8<S>) -> Self::Output {
+        rhs.simd.mul_f32x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for u8x32<S> {
+impl<S: Simd> core::ops::Div for f32x8<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_u8x32(self, rhs)
+    fn div(self, rhs: Self) -> Self::Output {
+        self.simd.div_f32x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for u8x32<S> {
-    #[doc = "Compute the bitwise XOR of two vectors."]
+impl<S: Simd> core::ops::DivAssign for f32x8<S> {
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_u8x32(*self, rhs);
+    fn div_assign(&mut self, rhs: Self) {
+        *self = self.simd.div_f32x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<u8> for u8x32<S> {
+impl<S: Simd> core::ops::Div<f32> for f32x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: u8) -> Self::Output {
-        self.simd.xor_u8x32(self, rhs.simd_into(self.simd))
+    fn div(self, rhs: f32) -> Self::Output {
+        self.simd.div_f32x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<u8> for u8x32<S> {
+impl<S: Simd> core::ops::DivAssign<f32> for f32x8<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: u8) {
-        *self = self.simd.xor_u8x32(*self, rhs.simd_into(self.simd));
+    fn div_assign(&mut self, rhs: f32) {
+        *self = self.simd.div_f32x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<u8x32<S>> for u8 {
-    type Output = u8x32<S>;
+impl<S: Simd> core::ops::Div<f32x8<S>> for f32 {
+    type Output = f32x8<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: u8x32<S>) -> Self::Output {
-        rhs.simd.xor_u8x32(self.simd_into(rhs.simd), rhs)
+    fn div(self, rhs: f32x8<S>) -> Self::Output {
+        rhs.simd.div_f32x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for u8x32<S> {
+impl<S: Simd> core::ops::Neg for i8x32<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise NOT of the vector."]
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    #[inline(always)]
+    fn neg(self) -> Self::Output {
+        self.simd.neg_i8x32(self)
+    }
+}
+impl<S: Simd> core::ops::Add for i8x32<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::AddAssign for i8x32<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add<i8> for i8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: i8) -> Self::Output {
+        self.simd.add_i8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::AddAssign<i8> for i8x32<S> {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: i8) {
+        *self = self.simd.add_i8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Add<i8x32<S>> for i8 {
+    type Output = i8x32<S>;
+    #[inline(always)]
+    fn add(self, rhs: i8x32<S>) -> Self::Output {
+        rhs.simd.add_i8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for i8x32<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::SubAssign for i8x32<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Sub<i8> for i8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn sub(self, rhs: i8) -> Self::Output {
+        self.simd.sub_i8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::SubAssign<i8> for i8x32<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: i8) {
+        *self = self.simd.sub_i8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Sub<i8x32<S>> for i8 {
+    type Output = i8x32<S>;
+    #[inline(always)]
+    fn sub(self, rhs: i8x32<S>) -> Self::Output {
+        rhs.simd.sub_i8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for i8x32<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::MulAssign for i8x32<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Mul<i8> for i8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn mul(self, rhs: i8) -> Self::Output {
+        self.simd.mul_i8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::MulAssign<i8> for i8x32<S> {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: i8) {
+        *self = self.simd.mul_i8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Mul<i8x32<S>> for i8 {
+    type Output = i8x32<S>;
+    #[inline(always)]
+    fn mul(self, rhs: i8x32<S>) -> Self::Output {
+        rhs.simd.mul_i8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for i8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for i8x32<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i8> for i8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitand(self, rhs: i8) -> Self::Output {
+        self.simd.and_i8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign<i8> for i8x32<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: i8) {
+        *self = self.simd.and_i8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i8x32<S>> for i8 {
+    type Output = i8x32<S>;
+    #[inline(always)]
+    fn bitand(self, rhs: i8x32<S>) -> Self::Output {
+        rhs.simd.and_i8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for i8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for i8x32<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitOr<i8> for i8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitor(self, rhs: i8) -> Self::Output {
+        self.simd.or_i8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign<i8> for i8x32<S> {
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: i8) {
+        *self = self.simd.or_i8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitOr<i8x32<S>> for i8 {
+    type Output = i8x32<S>;
+    #[inline(always)]
+    fn bitor(self, rhs: i8x32<S>) -> Self::Output {
+        rhs.simd.or_i8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXor for i8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for i8x32<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitXor<i8> for i8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitxor(self, rhs: i8) -> Self::Output {
+        self.simd.xor_i8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign<i8> for i8x32<S> {
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: i8) {
+        *self = self.simd.xor_i8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitXor<i8x32<S>> for i8 {
+    type Output = i8x32<S>;
+    #[inline(always)]
+    fn bitxor(self, rhs: i8x32<S>) -> Self::Output {
+        rhs.simd.xor_i8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Not for i8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_i8x32(self)
+    }
+}
+impl<S: Simd> core::ops::Shl<u32> for i8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[inline(always)]
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign<u32> for i8x32<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shl for i8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign for i8x32<S> {
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr<u32> for i8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[inline(always)]
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for i8x32<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr for i8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_i8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for i8x32<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_i8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add for u8x32<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::AddAssign for u8x32<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add<u8> for u8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: u8) -> Self::Output {
+        self.simd.add_u8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::AddAssign<u8> for u8x32<S> {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: u8) {
+        *self = self.simd.add_u8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Add<u8x32<S>> for u8 {
+    type Output = u8x32<S>;
+    #[inline(always)]
+    fn add(self, rhs: u8x32<S>) -> Self::Output {
+        rhs.simd.add_u8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for u8x32<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::SubAssign for u8x32<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Sub<u8> for u8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn sub(self, rhs: u8) -> Self::Output {
+        self.simd.sub_u8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::SubAssign<u8> for u8x32<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: u8) {
+        *self = self.simd.sub_u8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Sub<u8x32<S>> for u8 {
+    type Output = u8x32<S>;
+    #[inline(always)]
+    fn sub(self, rhs: u8x32<S>) -> Self::Output {
+        rhs.simd.sub_u8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for u8x32<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::MulAssign for u8x32<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Mul<u8> for u8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn mul(self, rhs: u8) -> Self::Output {
+        self.simd.mul_u8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::MulAssign<u8> for u8x32<S> {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: u8) {
+        *self = self.simd.mul_u8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Mul<u8x32<S>> for u8 {
+    type Output = u8x32<S>;
+    #[inline(always)]
+    fn mul(self, rhs: u8x32<S>) -> Self::Output {
+        rhs.simd.mul_u8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for u8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for u8x32<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u8> for u8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitand(self, rhs: u8) -> Self::Output {
+        self.simd.and_u8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign<u8> for u8x32<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: u8) {
+        *self = self.simd.and_u8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u8x32<S>> for u8 {
+    type Output = u8x32<S>;
+    #[inline(always)]
+    fn bitand(self, rhs: u8x32<S>) -> Self::Output {
+        rhs.simd.and_u8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for u8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for u8x32<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitOr<u8> for u8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitor(self, rhs: u8) -> Self::Output {
+        self.simd.or_u8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign<u8> for u8x32<S> {
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: u8) {
+        *self = self.simd.or_u8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitOr<u8x32<S>> for u8 {
+    type Output = u8x32<S>;
+    #[inline(always)]
+    fn bitor(self, rhs: u8x32<S>) -> Self::Output {
+        rhs.simd.or_u8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXor for u8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for u8x32<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitXor<u8> for u8x32<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitxor(self, rhs: u8) -> Self::Output {
+        self.simd.xor_u8x32(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign<u8> for u8x32<S> {
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: u8) {
+        *self = self.simd.xor_u8x32(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitXor<u8x32<S>> for u8 {
+    type Output = u8x32<S>;
+    #[inline(always)]
+    fn bitxor(self, rhs: u8x32<S>) -> Self::Output {
+        rhs.simd.xor_u8x32(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Not for u8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_u8x32(self)
+    }
+}
+impl<S: Simd> core::ops::Shl<u32> for u8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[inline(always)]
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign<u32> for u8x32<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shl for u8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign for u8x32<S> {
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr<u32> for u8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[inline(always)]
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for u8x32<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_u8x32(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr for u8x32<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_u8x32(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for u8x32<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = *self >> rhs; // reuses `Shr`, which forwards to `shrv_u8x32`
+    }
+}
+impl<S: Simd> core::ops::BitAnd for mask8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the logical AND of two masks."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_mask8x32(self, rhs) // `&` on two masks is just `and_mask8x32` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for mask8x32<S> {
+    #[doc = "Compute the logical AND of two masks."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = *self & rhs; // reuses `BitAnd`, which forwards to `and_mask8x32`
+    }
+}
+impl<S: Simd> core::ops::BitOr for mask8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the logical OR of two masks."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_mask8x32(self, rhs) // `|` on two masks is just `or_mask8x32` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for mask8x32<S> {
+    #[doc = "Compute the logical OR of two masks."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = *self | rhs; // reuses `BitOr`, which forwards to `or_mask8x32`
+    }
+}
+impl<S: Simd> core::ops::BitXor for mask8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the logical XOR of two masks."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask8x32(self, rhs) // `^` on two masks is just `xor_mask8x32` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for mask8x32<S> {
+    #[doc = "Compute the logical XOR of two masks."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = *self ^ rhs; // reuses `BitXor`, which forwards to `xor_mask8x32`
+    }
+}
+impl<S: Simd> core::ops::Not for mask8x32<S> {
+    type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_mask8x32(self) // unary `!` is just `not_mask8x32` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Neg for i16x16<S> {
+    type Output = Self;
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    #[inline(always)]
+    fn neg(self) -> Self::Output {
+        self.simd.neg_i16x16(self) // unary `-` is just `neg_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Add for i16x16<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_i16x16(self, rhs) // `+` on two vectors is just `add_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::AddAssign for i16x16<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs; // reuses `Add`, which forwards to `add_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Add<i16> for i16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `add_i16x16`
+    fn add(self, rhs: i16) -> Self::Output {
+        self.simd.add_i16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::AddAssign<i16> for i16x16<S> {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: i16) {
+        *self = *self + rhs; // reuses `Add<i16>`, which converts `rhs` and calls `add_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Add<i16x16<S>> for i16 {
+    type Output = i16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn add(self, rhs: i16x16<S>) -> Self::Output {
+        rhs.simd.add_i16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for i16x16<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_i16x16(self, rhs) // `-` on two vectors is just `sub_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::SubAssign for i16x16<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = *self - rhs; // reuses `Sub`, which forwards to `sub_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Sub<i16> for i16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `sub_i16x16`
+    fn sub(self, rhs: i16) -> Self::Output {
+        self.simd.sub_i16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::SubAssign<i16> for i16x16<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: i16) {
+        *self = *self - rhs; // reuses `Sub<i16>`, which converts `rhs` and calls `sub_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Sub<i16x16<S>> for i16 {
+    type Output = i16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn sub(self, rhs: i16x16<S>) -> Self::Output {
+        rhs.simd.sub_i16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for i16x16<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_i16x16(self, rhs) // `*` on two vectors is just `mul_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::MulAssign for i16x16<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = *self * rhs; // reuses `Mul`, which forwards to `mul_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Mul<i16> for i16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `mul_i16x16`
+    fn mul(self, rhs: i16) -> Self::Output {
+        self.simd.mul_i16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::MulAssign<i16> for i16x16<S> {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: i16) {
+        *self = *self * rhs; // reuses `Mul<i16>`, which converts `rhs` and calls `mul_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Mul<i16x16<S>> for i16 {
+    type Output = i16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn mul(self, rhs: i16x16<S>) -> Self::Output {
+        rhs.simd.mul_i16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for i16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_i16x16(self, rhs) // `&` on two vectors is just `and_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for i16x16<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = *self & rhs; // reuses `BitAnd`, which forwards to `and_i16x16`
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i16> for i16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `and_i16x16`
+    fn bitand(self, rhs: i16) -> Self::Output {
+        self.simd.and_i16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign<i16> for i16x16<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: i16) {
+        *self = *self & rhs; // reuses `BitAnd<i16>`, which converts `rhs` and calls `and_i16x16`
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i16x16<S>> for i16 {
+    type Output = i16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitand(self, rhs: i16x16<S>) -> Self::Output {
+        rhs.simd.and_i16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for i16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_i16x16(self, rhs) // `|` on two vectors is just `or_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for i16x16<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = *self | rhs; // reuses `BitOr`, which forwards to `or_i16x16`
+    }
+}
+impl<S: Simd> core::ops::BitOr<i16> for i16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `or_i16x16`
+    fn bitor(self, rhs: i16) -> Self::Output {
+        self.simd.or_i16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign<i16> for i16x16<S> {
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: i16) {
+        *self = *self | rhs; // reuses `BitOr<i16>`, which converts `rhs` and calls `or_i16x16`
+    }
+}
+impl<S: Simd> core::ops::BitOr<i16x16<S>> for i16 {
+    type Output = i16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitor(self, rhs: i16x16<S>) -> Self::Output {
+        rhs.simd.or_i16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXor for i16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_i16x16(self, rhs) // `^` on two vectors is just `xor_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for i16x16<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = *self ^ rhs; // reuses `BitXor`, which forwards to `xor_i16x16`
+    }
+}
+impl<S: Simd> core::ops::BitXor<i16> for i16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `xor_i16x16`
+    fn bitxor(self, rhs: i16) -> Self::Output {
+        self.simd.xor_i16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign<i16> for i16x16<S> {
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: i16) {
+        *self = *self ^ rhs; // reuses `BitXor<i16>`, which converts `rhs` and calls `xor_i16x16`
+    }
+}
+impl<S: Simd> core::ops::BitXor<i16x16<S>> for i16 {
+    type Output = i16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitxor(self, rhs: i16x16<S>) -> Self::Output {
+        rhs.simd.xor_i16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Not for i16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_i16x16(self) // unary `!` is just `not_i16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Shl<u32> for i16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[inline(always)]
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_i16x16(self, rhs) // `rhs` is a single `u32` count, passed on unchanged
+    }
+}
+impl<S: Simd> core::ops::ShlAssign<u32> for i16x16<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = *self << rhs; // reuses `Shl<u32>`, which forwards to `shl_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Shl for i16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_i16x16(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign for i16x16<S> {
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_i16x16(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr<u32> for i16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[inline(always)]
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_i16x16(self, rhs) // `rhs` is a single `u32` count, passed on unchanged
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for i16x16<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = *self >> rhs; // reuses `Shr<u32>`, which forwards to `shr_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Shr for i16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_i16x16(self, rhs) // vector `rhs`: `shrv_i16x16`, not the `u32`-count `shr_i16x16`
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for i16x16<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = *self >> rhs; // reuses `Shr`, which forwards to `shrv_i16x16`
+    }
+}
+impl<S: Simd> core::ops::Add for u16x16<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_u16x16(self, rhs) // `+` on two vectors is just `add_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::AddAssign for u16x16<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs; // reuses `Add`, which forwards to `add_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Add<u16> for u16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `add_u16x16`
+    fn add(self, rhs: u16) -> Self::Output {
+        self.simd.add_u16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::AddAssign<u16> for u16x16<S> {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: u16) {
+        *self = *self + rhs; // reuses `Add<u16>`, which converts `rhs` and calls `add_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Add<u16x16<S>> for u16 {
+    type Output = u16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn add(self, rhs: u16x16<S>) -> Self::Output {
+        rhs.simd.add_u16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for u16x16<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_u16x16(self, rhs) // `-` on two vectors is just `sub_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::SubAssign for u16x16<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = *self - rhs; // reuses `Sub`, which forwards to `sub_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Sub<u16> for u16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `sub_u16x16`
+    fn sub(self, rhs: u16) -> Self::Output {
+        self.simd.sub_u16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::SubAssign<u16> for u16x16<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: u16) {
+        *self = *self - rhs; // reuses `Sub<u16>`, which converts `rhs` and calls `sub_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Sub<u16x16<S>> for u16 {
+    type Output = u16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn sub(self, rhs: u16x16<S>) -> Self::Output {
+        rhs.simd.sub_u16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for u16x16<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_u16x16(self, rhs) // `*` on two vectors is just `mul_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::MulAssign for u16x16<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = *self * rhs; // reuses `Mul`, which forwards to `mul_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Mul<u16> for u16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `mul_u16x16`
+    fn mul(self, rhs: u16) -> Self::Output {
+        self.simd.mul_u16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::MulAssign<u16> for u16x16<S> {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: u16) {
+        *self = *self * rhs; // reuses `Mul<u16>`, which converts `rhs` and calls `mul_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Mul<u16x16<S>> for u16 {
+    type Output = u16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn mul(self, rhs: u16x16<S>) -> Self::Output {
+        rhs.simd.mul_u16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for u16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_u16x16(self, rhs) // `&` on two vectors is just `and_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for u16x16<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = *self & rhs; // reuses `BitAnd`, which forwards to `and_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u16> for u16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `and_u16x16`
+    fn bitand(self, rhs: u16) -> Self::Output {
+        self.simd.and_u16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign<u16> for u16x16<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: u16) {
+        *self = *self & rhs; // reuses `BitAnd<u16>`, which converts `rhs` and calls `and_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u16x16<S>> for u16 {
+    type Output = u16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitand(self, rhs: u16x16<S>) -> Self::Output {
+        rhs.simd.and_u16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for u16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_u16x16(self, rhs) // `|` on two vectors is just `or_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for u16x16<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = *self | rhs; // reuses `BitOr`, which forwards to `or_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitOr<u16> for u16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `or_u16x16`
+    fn bitor(self, rhs: u16) -> Self::Output {
+        self.simd.or_u16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign<u16> for u16x16<S> {
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: u16) {
+        *self = *self | rhs; // reuses `BitOr<u16>`, which converts `rhs` and calls `or_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitOr<u16x16<S>> for u16 {
+    type Output = u16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitor(self, rhs: u16x16<S>) -> Self::Output {
+        rhs.simd.or_u16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXor for u16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_u16x16(self, rhs) // `^` on two vectors is just `xor_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for u16x16<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = *self ^ rhs; // reuses `BitXor`, which forwards to `xor_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitXor<u16> for u16x16<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `xor_u16x16`
+    fn bitxor(self, rhs: u16) -> Self::Output {
+        self.simd.xor_u16x16(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign<u16> for u16x16<S> {
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: u16) {
+        *self = *self ^ rhs; // reuses `BitXor<u16>`, which converts `rhs` and calls `xor_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitXor<u16x16<S>> for u16 {
+    type Output = u16x16<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitxor(self, rhs: u16x16<S>) -> Self::Output {
+        rhs.simd.xor_u16x16(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Not for u16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_u16x16(self) // unary `!` is just `not_u16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Shl<u32> for u16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[inline(always)]
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_u16x16(self, rhs) // `rhs` is a single `u32` count, passed on unchanged
+    }
+}
+impl<S: Simd> core::ops::ShlAssign<u32> for u16x16<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = *self << rhs; // reuses `Shl<u32>`, which forwards to `shl_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Shl for u16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_u16x16(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign for u16x16<S> {
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_u16x16(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr<u32> for u16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[inline(always)]
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_u16x16(self, rhs) // `rhs` is a single `u32` count, passed on unchanged
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for u16x16<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = *self >> rhs; // reuses `Shr<u32>`, which forwards to `shr_u16x16`
+    }
+}
+impl<S: Simd> core::ops::Shr for u16x16<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_u16x16(self, rhs) // vector `rhs`: `shrv_u16x16`, not the `u32`-count `shr_u16x16`
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for u16x16<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = *self >> rhs; // reuses `Shr`, which forwards to `shrv_u16x16`
+    }
+}
+impl<S: Simd> core::ops::BitAnd for mask16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the logical AND of two masks."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_mask16x16(self, rhs) // `&` on two masks is just `and_mask16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for mask16x16<S> {
+    #[doc = "Compute the logical AND of two masks."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = *self & rhs; // reuses `BitAnd`, which forwards to `and_mask16x16`
+    }
+}
+impl<S: Simd> core::ops::BitOr for mask16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the logical OR of two masks."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_mask16x16(self, rhs) // `|` on two masks is just `or_mask16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for mask16x16<S> {
+    #[doc = "Compute the logical OR of two masks."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = *self | rhs; // reuses `BitOr`, which forwards to `or_mask16x16`
+    }
+}
+impl<S: Simd> core::ops::BitXor for mask16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the logical XOR of two masks."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask16x16(self, rhs) // `^` on two masks is just `xor_mask16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for mask16x16<S> {
+    #[doc = "Compute the logical XOR of two masks."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = *self ^ rhs; // reuses `BitXor`, which forwards to `xor_mask16x16`
+    }
+}
+impl<S: Simd> core::ops::Not for mask16x16<S> {
+    type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_mask16x16(self) // unary `!` is just `not_mask16x16` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Neg for i32x8<S> {
+    type Output = Self;
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    #[inline(always)]
+    fn neg(self) -> Self::Output {
+        self.simd.neg_i32x8(self) // unary `-` is just `neg_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Add for i32x8<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_i32x8(self, rhs) // `+` on two vectors is just `add_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::AddAssign for i32x8<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = *self + rhs; // reuses `Add`, which forwards to `add_i32x8`
+    }
+}
+impl<S: Simd> core::ops::Add<i32> for i32x8<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `add_i32x8`
+    fn add(self, rhs: i32) -> Self::Output {
+        self.simd.add_i32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::AddAssign<i32> for i32x8<S> {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: i32) {
+        *self = *self + rhs; // reuses `Add<i32>`, which converts `rhs` and calls `add_i32x8`
+    }
+}
+impl<S: Simd> core::ops::Add<i32x8<S>> for i32 {
+    type Output = i32x8<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn add(self, rhs: i32x8<S>) -> Self::Output {
+        rhs.simd.add_i32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for i32x8<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_i32x8(self, rhs) // `-` on two vectors is just `sub_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::SubAssign for i32x8<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = *self - rhs; // reuses `Sub`, which forwards to `sub_i32x8`
+    }
+}
+impl<S: Simd> core::ops::Sub<i32> for i32x8<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `sub_i32x8`
+    fn sub(self, rhs: i32) -> Self::Output {
+        self.simd.sub_i32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::SubAssign<i32> for i32x8<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: i32) {
+        *self = *self - rhs; // reuses `Sub<i32>`, which converts `rhs` and calls `sub_i32x8`
+    }
+}
+impl<S: Simd> core::ops::Sub<i32x8<S>> for i32 {
+    type Output = i32x8<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn sub(self, rhs: i32x8<S>) -> Self::Output {
+        rhs.simd.sub_i32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for i32x8<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_i32x8(self, rhs) // `*` on two vectors is just `mul_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::MulAssign for i32x8<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = *self * rhs; // reuses `Mul`, which forwards to `mul_i32x8`
+    }
+}
+impl<S: Simd> core::ops::Mul<i32> for i32x8<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `mul_i32x8`
+    fn mul(self, rhs: i32) -> Self::Output {
+        self.simd.mul_i32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::MulAssign<i32> for i32x8<S> {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: i32) {
+        *self = *self * rhs; // reuses `Mul<i32>`, which converts `rhs` and calls `mul_i32x8`
+    }
+}
+impl<S: Simd> core::ops::Mul<i32x8<S>> for i32 {
+    type Output = i32x8<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn mul(self, rhs: i32x8<S>) -> Self::Output {
+        rhs.simd.mul_i32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for i32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_i32x8(self, rhs) // `&` on two vectors is just `and_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for i32x8<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = *self & rhs; // reuses `BitAnd`, which forwards to `and_i32x8`
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i32> for i32x8<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `and_i32x8`
+    fn bitand(self, rhs: i32) -> Self::Output {
+        self.simd.and_i32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign<i32> for i32x8<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: i32) {
+        *self = *self & rhs; // reuses `BitAnd<i32>`, which converts `rhs` and calls `and_i32x8`
+    }
+}
+impl<S: Simd> core::ops::BitAnd<i32x8<S>> for i32 {
+    type Output = i32x8<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitand(self, rhs: i32x8<S>) -> Self::Output {
+        rhs.simd.and_i32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for i32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_i32x8(self, rhs) // `|` on two vectors is just `or_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for i32x8<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = *self | rhs; // reuses `BitOr`, which forwards to `or_i32x8`
+    }
+}
+impl<S: Simd> core::ops::BitOr<i32> for i32x8<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `or_i32x8`
+    fn bitor(self, rhs: i32) -> Self::Output {
+        self.simd.or_i32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign<i32> for i32x8<S> {
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: i32) {
+        *self = *self | rhs; // reuses `BitOr<i32>`, which converts `rhs` and calls `or_i32x8`
+    }
+}
+impl<S: Simd> core::ops::BitOr<i32x8<S>> for i32 {
+    type Output = i32x8<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitor(self, rhs: i32x8<S>) -> Self::Output {
+        rhs.simd.or_i32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXor for i32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_i32x8(self, rhs) // `^` on two vectors is just `xor_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for i32x8<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = *self ^ rhs; // reuses `BitXor`, which forwards to `xor_i32x8`
+    }
+}
+impl<S: Simd> core::ops::BitXor<i32> for i32x8<S> {
+    type Output = Self;
+    #[inline(always)] // scalar `rhs` is converted with `simd_into` and then handed to `xor_i32x8`
+    fn bitxor(self, rhs: i32) -> Self::Output {
+        self.simd.xor_i32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign<i32> for i32x8<S> {
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: i32) {
+        *self = *self ^ rhs; // reuses `BitXor<i32>`, which converts `rhs` and calls `xor_i32x8`
+    }
+}
+impl<S: Simd> core::ops::BitXor<i32x8<S>> for i32 {
+    type Output = i32x8<S>;
+    #[inline(always)] // scalar `self` is converted with `simd_into` and stays the first argument
+    fn bitxor(self, rhs: i32x8<S>) -> Self::Output {
+        rhs.simd.xor_i32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Not for i32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_i32x8(self) // unary `!` is just `not_i32x8` on `self.simd`
+    }
+}
+impl<S: Simd> core::ops::Shl<u32> for i32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[inline(always)]
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_i32x8(self, rhs) // `rhs` is a single `u32` count, passed on unchanged
+    }
+}
+impl<S: Simd> core::ops::ShlAssign<u32> for i32x8<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_i32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shl for i32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_i32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign for i32x8<S> {
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_i32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr<u32> for i32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[inline(always)]
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_i32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for i32x8<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_i32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr for i32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_i32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for i32x8<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_i32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add for u32x8<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::AddAssign for u32x8<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add<u32> for u32x8<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: u32) -> Self::Output {
+        self.simd.add_u32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::AddAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: u32) {
+        *self = self.simd.add_u32x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Add<u32x8<S>> for u32 {
+    type Output = u32x8<S>;
+    #[inline(always)]
+    fn add(self, rhs: u32x8<S>) -> Self::Output {
+        rhs.simd.add_u32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for u32x8<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::SubAssign for u32x8<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Sub<u32> for u32x8<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn sub(self, rhs: u32) -> Self::Output {
+        self.simd.sub_u32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::SubAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: u32) {
+        *self = self.simd.sub_u32x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Sub<u32x8<S>> for u32 {
+    type Output = u32x8<S>;
+    #[inline(always)]
+    fn sub(self, rhs: u32x8<S>) -> Self::Output {
+        rhs.simd.sub_u32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for u32x8<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::MulAssign for u32x8<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Mul<u32> for u32x8<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn mul(self, rhs: u32) -> Self::Output {
+        self.simd.mul_u32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::MulAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: u32) {
+        *self = self.simd.mul_u32x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Mul<u32x8<S>> for u32 {
+    type Output = u32x8<S>;
+    #[inline(always)]
+    fn mul(self, rhs: u32x8<S>) -> Self::Output {
+        rhs.simd.mul_u32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for u32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for u32x8<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u32> for u32x8<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitand(self, rhs: u32) -> Self::Output {
+        self.simd.and_u32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: u32) {
+        *self = self.simd.and_u32x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u32x8<S>> for u32 {
+    type Output = u32x8<S>;
+    #[inline(always)]
+    fn bitand(self, rhs: u32x8<S>) -> Self::Output {
+        rhs.simd.and_u32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for u32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for u32x8<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitOr<u32> for u32x8<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitor(self, rhs: u32) -> Self::Output {
+        self.simd.or_u32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: u32) {
+        *self = self.simd.or_u32x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitOr<u32x8<S>> for u32 {
+    type Output = u32x8<S>;
+    #[inline(always)]
+    fn bitor(self, rhs: u32x8<S>) -> Self::Output {
+        rhs.simd.or_u32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXor for u32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for u32x8<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitXor<u32> for u32x8<S> {
+    type Output = Self;
+    #[inline(always)]
+    fn bitxor(self, rhs: u32) -> Self::Output {
+        self.simd.xor_u32x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: u32) {
+        *self = self.simd.xor_u32x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitXor<u32x8<S>> for u32 {
+    type Output = u32x8<S>;
+    #[inline(always)]
+    fn bitxor(self, rhs: u32x8<S>) -> Self::Output {
+        rhs.simd.xor_u32x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Not for u32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    #[inline(always)]
+    fn not(self) -> Self::Output {
+        self.simd.not_u32x8(self)
+    }
+}
+impl<S: Simd> core::ops::Shl<u32> for u32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[inline(always)]
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shl for u32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShlAssign for u32x8<S> {
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr<u32> for u32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[inline(always)]
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for u32x8<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr for u32x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_u32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for u32x8<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_u32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitAnd for mask32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the logical AND of two masks."]
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_mask32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAndAssign for mask32x8<S> {
+    #[doc = "Compute the logical AND of two masks."]
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_mask32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitOr for mask32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the logical OR of two masks."]
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_mask32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOrAssign for mask32x8<S> {
+    #[doc = "Compute the logical OR of two masks."]
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_mask32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::BitXor for mask32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the logical XOR of two masks."]
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask32x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::BitXorAssign for mask32x8<S> {
+    #[doc = "Compute the logical XOR of two masks."]
+    #[inline(always)]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_mask32x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Not for mask32x8<S> {
+    type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_u8x32(self)
+        self.simd.not_mask32x8(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for u8x32<S> {
+impl<S: Simd> core::ops::Neg for f64x4<S> {
     type Output = Self;
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    #[doc = "Negate each element of the vector."]
     #[inline(always)]
-    fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_u8x32(self, rhs)
+    fn neg(self) -> Self::Output {
+        self.simd.neg_f64x4(self)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for u8x32<S> {
+impl<S: Simd> core::ops::Add for f64x4<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_u8x32(*self, rhs);
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_f64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Shl for u8x32<S> {
+impl<S: Simd> core::ops::AddAssign for f64x4<S> {
+    #[doc = "Add two vectors element-wise."]
+    #[inline(always)]
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_f64x4(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Add<f64> for f64x4<S> {
     type Output = Self;
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_u8x32(self, rhs)
+    fn add(self, rhs: f64) -> Self::Output {
+        self.simd.add_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::ShlAssign for u8x32<S> {
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::AddAssign<f64> for f64x4<S> {
     #[inline(always)]
-    fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_u8x32(*self, rhs);
+    fn add_assign(&mut self, rhs: f64) {
+        *self = self.simd.add_f64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for u8x32<S> {
+impl<S: Simd> core::ops::Add<f64x4<S>> for f64 {
+    type Output = f64x4<S>;
+    #[inline(always)]
+    fn add(self, rhs: f64x4<S>) -> Self::Output {
+        rhs.simd.add_f64x4(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for f64x4<S> {
     type Output = Self;
-    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_u8x32(self, rhs)
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_f64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for u8x32<S> {
+impl<S: Simd> core::ops::SubAssign for f64x4<S> {
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_u8x32(*self, rhs);
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_f64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for u8x32<S> {
+impl<S: Simd> core::ops::Sub<f64> for f64x4<S> {
     type Output = Self;
-    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_u8x32(self, rhs)
+    fn sub(self, rhs: f64) -> Self::Output {
+        self.simd.sub_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::ShrAssign for u8x32<S> {
-    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::SubAssign<f64> for f64x4<S> {
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_u8x32(*self, rhs);
+    fn sub_assign(&mut self, rhs: f64) {
+        *self = self.simd.sub_f64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd for mask8x32<S> {
+impl<S: Simd> core::ops::Sub<f64x4<S>> for f64 {
+    type Output = f64x4<S>;
+    #[inline(always)]
+    fn sub(self, rhs: f64x4<S>) -> Self::Output {
+        rhs.simd.sub_f64x4(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for f64x4<S> {
     type Output = Self;
-    #[doc = "Compute the logical AND of two masks."]
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask8x32(self, rhs)
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_f64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for mask8x32<S> {
-    #[doc = "Compute the logical AND of two masks."]
+impl<S: Simd> core::ops::MulAssign for f64x4<S> {
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask8x32(*self, rhs);
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_f64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for mask8x32<S> {
+impl<S: Simd> core::ops::Mul<f64> for f64x4<S> {
     type Output = Self;
-    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask8x32(self, rhs)
+    fn mul(self, rhs: f64) -> Self::Output {
+        self.simd.mul_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for mask8x32<S> {
-    #[doc = "Compute the logical OR of two masks."]
+impl<S: Simd> core::ops::MulAssign<f64> for f64x4<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask8x32(*self, rhs);
+    fn mul_assign(&mut self, rhs: f64) {
+        *self = self.simd.mul_f64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor for mask8x32<S> {
+impl<S: Simd> core::ops::Mul<f64x4<S>> for f64 {
+    type Output = f64x4<S>;
+    #[inline(always)]
+    fn mul(self, rhs: f64x4<S>) -> Self::Output {
+        rhs.simd.mul_f64x4(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Div for f64x4<S> {
     type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask8x32(self, rhs)
+    fn div(self, rhs: Self) -> Self::Output {
+        self.simd.div_f64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for mask8x32<S> {
-    #[doc = "Compute the logical XOR of two masks."]
+impl<S: Simd> core::ops::DivAssign for f64x4<S> {
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask8x32(*self, rhs);
+    fn div_assign(&mut self, rhs: Self) {
+        *self = self.simd.div_f64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Not for mask8x32<S> {
+impl<S: Simd> core::ops::Div<f64> for f64x4<S> {
     type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask8x32(self)
+    fn div(self, rhs: f64) -> Self::Output {
+        self.simd.div_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Neg for i16x16<S> {
+impl<S: Simd> core::ops::DivAssign<f64> for f64x4<S> {
+    #[inline(always)]
+    fn div_assign(&mut self, rhs: f64) {
+        *self = self.simd.div_f64x4(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Div<f64x4<S>> for f64 {
+    type Output = f64x4<S>;
+    #[inline(always)]
+    fn div(self, rhs: f64x4<S>) -> Self::Output {
+        rhs.simd.div_f64x4(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Neg for i64x4<S> {
     type Output = Self;
     #[doc = "Negate each element of the vector, wrapping on overflow."]
     #[inline(always)]
     fn neg(self) -> Self::Output {
-        self.simd.neg_i16x16(self)
+        self.simd.neg_i64x4(self)
     }
 }
-impl<S: Simd> core::ops::Add for i16x16<S> {
+impl<S: Simd> core::ops::Add for i64x4<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_i16x16(self, rhs)
+        self.simd.add_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for i16x16<S> {
+impl<S: Simd> core::ops::AddAssign for i64x4<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_i16x16(*self, rhs);
+        *self = self.simd.add_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<i16> for i16x16<S> {
+impl<S: Simd> core::ops::Add<i64> for i64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: i16) -> Self::Output {
-        self.simd.add_i16x16(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: i64) -> Self::Output {
+        self.simd.add_i64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<i16> for i16x16<S> {
+impl<S: Simd> core::ops::AddAssign<i64> for i64x4<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: i16) {
-        *self = self.simd.add_i16x16(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: i64) {
+        *self = self.simd.add_i64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<i16x16<S>> for i16 {
-    type Output = i16x16<S>;
+impl<S: Simd> core::ops::Add<i64x4<S>> for i64 {
+    type Output = i64x4<S>;
     #[inline(always)]
-    fn add(self, rhs: i16x16<S>) -> Self::Output {
-        rhs.simd.add_i16x16(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: i64x4<S>) -> Self::Output {
+        rhs.simd.add_i64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for i16x16<S> {
+impl<S: Simd> core::ops::Sub for i64x4<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_i16x16(self, rhs)
+        self.simd.sub_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for i16x16<S> {
+impl<S: Simd> core::ops::SubAssign for i64x4<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_i16x16(*self, rhs);
+        *self = self.simd.sub_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<i16> for i16x16<S> {
+impl<S: Simd> core::ops::Sub<i64> for i64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: i16) -> Self::Output {
-        self.simd.sub_i16x16(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: i64) -> Self::Output {
+        self.simd.sub_i64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<i16> for i16x16<S> {
+impl<S: Simd> core::ops::SubAssign<i64> for i64x4<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: i16) {
-        *self = self.simd.sub_i16x16(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: i64) {
+        *self = self.simd.sub_i64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<i16x16<S>> for i16 {
-    type Output = i16x16<S>;
+impl<S: Simd> core::ops::Sub<i64x4<S>> for i64 {
+    type Output = i64x4<S>;
     #[inline(always)]
-    fn sub(self, rhs: i16x16<S>) -> Self::Output {
-        rhs.simd.sub_i16x16(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: i64x4<S>) -> Self::Output {
+        rhs.simd.sub_i64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for i16x16<S> {
+impl<S: Simd> core::ops::Mul for i64x4<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_i16x16(self, rhs)
+        self.simd.mul_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for i16x16<S> {
+impl<S: Simd> core::ops::MulAssign for i64x4<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_i16x16(*self, rhs);
+        *self = self.simd.mul_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i16> for i16x16<S> {
+impl<S: Simd> core::ops::Mul<i64> for i64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: i16) -> Self::Output {
-        self.simd.mul_i16x16(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: i64) -> Self::Output {
+        self.simd.mul_i64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<i16> for i16x16<S> {
+impl<S: Simd> core::ops::MulAssign<i64> for i64x4<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: i16) {
-        *self = self.simd.mul_i16x16(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: i64) {
+        *self = self.simd.mul_i64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<i16x16<S>> for i16 {
-    type Output = i16x16<S>;
+impl<S: Simd> core::ops::Mul<i64x4<S>> for i64 {
+    type Output = i64x4<S>;
     #[inline(always)]
-    fn mul(self, rhs: i16x16<S>) -> Self::Output {
-        rhs.simd.mul_i16x16(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: i64x4<S>) -> Self::Output {
+        rhs.simd.mul_i64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for i16x16<S> {
+impl<S: Simd> core::ops::BitAnd for i64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_i16x16(self, rhs)
+        self.simd.and_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for i16x16<S> {
+impl<S: Simd> core::ops::BitAndAssign for i64x4<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_i16x16(*self, rhs);
+        *self = self.simd.and_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<i16> for i16x16<S> {
+impl<S: Simd> core::ops::BitAnd<i64> for i64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: i16) -> Self::Output {
-        self.simd.and_i16x16(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: i64) -> Self::Output {
+        self.simd.and_i64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<i16> for i16x16<S> {
+impl<S: Simd> core::ops::BitAndAssign<i64> for i64x4<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: i16) {
-        *self = self.simd.and_i16x16(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: i64) {
+        *self = self.simd.and_i64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<i16x16<S>> for i16 {
-    type Output = i16x16<S>;
+impl<S: Simd> core::ops::BitAnd<i64x4<S>> for i64 {
+    type Output = i64x4<S>;
     #[inline(always)]
-    fn bitand(self, rhs: i16x16<S>) -> Self::Output {
-        rhs.simd.and_i16x16(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: i64x4<S>) -> Self::Output {
+        rhs.simd.and_i64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for i16x16<S> {
+impl<S: Simd> core::ops::BitOr for i64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_i16x16(self, rhs)
+        self.simd.or_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for i16x16<S> {
+impl<S: Simd> core::ops::BitOrAssign for i64x4<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_i16x16(*self, rhs);
+        *self = self.simd.or_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<i16> for i16x16<S> {
+impl<S: Simd> core::ops::BitOr<i64> for i64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: i16) -> Self::Output {
-        self.simd.or_i16x16(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: i64) -> Self::Output {
+        self.simd.or_i64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<i16> for i16x16<S> {
+impl<S: Simd> core::ops::BitOrAssign<i64> for i64x4<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: i16) {
-        *self = self.simd.or_i16x16(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: i64) {
+        *self = self.simd.or_i64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<i16x16<S>> for i16 {
-    type Output = i16x16<S>;
+impl<S: Simd> core::ops::BitOr<i64x4<S>> for i64 {
+    type Output = i64x4<S>;
     #[inline(always)]
-    fn bitor(self, rhs: i16x16<S>) -> Self::Output {
-        rhs.simd.or_i16x16(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: i64x4<S>) -> Self::Output {
+        rhs.simd.or_i64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for i16x16<S> {
+impl<S: Simd> core::ops::BitXor for i64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_i16x16(self, rhs)
+        self.simd.xor_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for i16x16<S> {
+impl<S: Simd> core::ops::BitXorAssign for i64x4<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_i16x16(*self, rhs);
+        *self = self.simd.xor_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<i16> for i16x16<S> {
+impl<S: Simd> core::ops::BitXor<i64> for i64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: i16) -> Self::Output {
-        self.simd.xor_i16x16(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: i64) -> Self::Output {
+        self.simd.xor_i64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<i16> for i16x16<S> {
+impl<S: Simd> core::ops::BitXorAssign<i64> for i64x4<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: i16) {
-        *self = self.simd.xor_i16x16(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: i64) {
+        *self = self.simd.xor_i64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<i16x16<S>> for i16 {
-    type Output = i16x16<S>;
+impl<S: Simd> core::ops::BitXor<i64x4<S>> for i64 {
+    type Output = i64x4<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: i16x16<S>) -> Self::Output {
-        rhs.simd.xor_i16x16(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: i64x4<S>) -> Self::Output {
+        rhs.simd.xor_i64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for i16x16<S> {
+impl<S: Simd> core::ops::Not for i64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_i16x16(self)
+        self.simd.not_i64x4(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for i16x16<S> {
+impl<S: Simd> core::ops::Shl<u32> for i64x4<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_i16x16(self, rhs)
+        self.simd.shl_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for i16x16<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for i64x4<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_i16x16(*self, rhs);
+        *self = self.simd.shl_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for i16x16<S> {
+impl<S: Simd> core::ops::Shl for i64x4<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_i16x16(self, rhs)
+        self.simd.shlv_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for i16x16<S> {
+impl<S: Simd> core::ops::ShlAssign for i64x4<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_i16x16(*self, rhs);
+        *self = self.simd.shlv_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for i16x16<S> {
+impl<S: Simd> core::ops::Shr<u32> for i64x4<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_i16x16(self, rhs)
+        self.simd.shr_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for i16x16<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for i64x4<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_i16x16(*self, rhs);
+        *self = self.simd.shr_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for i16x16<S> {
+impl<S: Simd> core::ops::Shr for i64x4<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_i16x16(self, rhs)
+        self.simd.shrv_i64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for i16x16<S> {
+impl<S: Simd> core::ops::ShrAssign for i64x4<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_i16x16(*self, rhs);
+        *self = self.simd.shrv_i64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add for u16x16<S> {
+impl<S: Simd> core::ops::Add for u64x4<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_u16x16(self, rhs)
+        self.simd.add_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for u16x16<S> {
+impl<S: Simd> core::ops::AddAssign for u64x4<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_u16x16(*self, rhs);
+        *self = self.simd.add_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<u16> for u16x16<S> {
+impl<S: Simd> core::ops::Add<u64> for u64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: u16) -> Self::Output {
-        self.simd.add_u16x16(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: u64) -> Self::Output {
+        self.simd.add_u64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<u16> for u16x16<S> {
+impl<S: Simd> core::ops::AddAssign<u64> for u64x4<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: u16) {
-        *self = self.simd.add_u16x16(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: u64) {
+        *self = self.simd.add_u64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<u16x16<S>> for u16 {
-    type Output = u16x16<S>;
+impl<S: Simd> core::ops::Add<u64x4<S>> for u64 {
+    type Output = u64x4<S>;
     #[inline(always)]
-    fn add(self, rhs: u16x16<S>) -> Self::Output {
-        rhs.simd.add_u16x16(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: u64x4<S>) -> Self::Output {
+        rhs.simd.add_u64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for u16x16<S> {
+impl<S: Simd> core::ops::Sub for u64x4<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_u16x16(self, rhs)
+        self.simd.sub_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for u16x16<S> {
+impl<S: Simd> core::ops::SubAssign for u64x4<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_u16x16(*self, rhs);
+        *self = self.simd.sub_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<u16> for u16x16<S> {
+impl<S: Simd> core::ops::Sub<u64> for u64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: u16) -> Self::Output {
-        self.simd.sub_u16x16(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: u64) -> Self::Output {
+        self.simd.sub_u64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<u16> for u16x16<S> {
+impl<S: Simd> core::ops::SubAssign<u64> for u64x4<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: u16) {
-        *self = self.simd.sub_u16x16(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: u64) {
+        *self = self.simd.sub_u64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<u16x16<S>> for u16 {
-    type Output = u16x16<S>;
+impl<S: Simd> core::ops::Sub<u64x4<S>> for u64 {
+    type Output = u64x4<S>;
     #[inline(always)]
-    fn sub(self, rhs: u16x16<S>) -> Self::Output {
-        rhs.simd.sub_u16x16(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: u64x4<S>) -> Self::Output {
+        rhs.simd.sub_u64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for u16x16<S> {
+impl<S: Simd> core::ops::Mul for u64x4<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_u16x16(self, rhs)
+        self.simd.mul_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for u16x16<S> {
+impl<S: Simd> core::ops::MulAssign for u64x4<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_u16x16(*self, rhs);
+        *self = self.simd.mul_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<u16> for u16x16<S> {
+impl<S: Simd> core::ops::Mul<u64> for u64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: u16) -> Self::Output {
-        self.simd.mul_u16x16(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: u64) -> Self::Output {
+        self.simd.mul_u64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<u16> for u16x16<S> {
+impl<S: Simd> core::ops::MulAssign<u64> for u64x4<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: u16) {
-        *self = self.simd.mul_u16x16(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: u64) {
+        *self = self.simd.mul_u64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<u16x16<S>> for u16 {
-    type Output = u16x16<S>;
+impl<S: Simd> core::ops::Mul<u64x4<S>> for u64 {
+    type Output = u64x4<S>;
     #[inline(always)]
-    fn mul(self, rhs: u16x16<S>) -> Self::Output {
-        rhs.simd.mul_u16x16(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: u64x4<S>) -> Self::Output {
+        rhs.simd.mul_u64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for u16x16<S> {
+impl<S: Simd> core::ops::BitAnd for u64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_u16x16(self, rhs)
+        self.simd.and_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for u16x16<S> {
+impl<S: Simd> core::ops::BitAndAssign for u64x4<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_u16x16(*self, rhs);
+        *self = self.simd.and_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<u16> for u16x16<S> {
+impl<S: Simd> core::ops::BitAnd<u64> for u64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: u16) -> Self::Output {
-        self.simd.and_u16x16(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: u64) -> Self::Output {
+        self.simd.and_u64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<u16> for u16x16<S> {
+impl<S: Simd> core::ops::BitAndAssign<u64> for u64x4<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: u16) {
-        *self = self.simd.and_u16x16(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: u64) {
+        *self = self.simd.and_u64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<u16x16<S>> for u16 {
-    type Output = u16x16<S>;
+impl<S: Simd> core::ops::BitAnd<u64x4<S>> for u64 {
+    type Output = u64x4<S>;
     #[inline(always)]
-    fn bitand(self, rhs: u16x16<S>) -> Self::Output {
-        rhs.simd.and_u16x16(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: u64x4<S>) -> Self::Output {
+        rhs.simd.and_u64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for u16x16<S> {
+impl<S: Simd> core::ops::BitOr for u64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_u16x16(self, rhs)
+        self.simd.or_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for u16x16<S> {
+impl<S: Simd> core::ops::BitOrAssign for u64x4<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_u16x16(*self, rhs);
+        *self = self.simd.or_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<u16> for u16x16<S> {
+impl<S: Simd> core::ops::BitOr<u64> for u64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: u16) -> Self::Output {
-        self.simd.or_u16x16(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: u64) -> Self::Output {
+        self.simd.or_u64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<u16> for u16x16<S> {
+impl<S: Simd> core::ops::BitOrAssign<u64> for u64x4<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: u16) {
-        *self = self.simd.or_u16x16(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: u64) {
+        *self = self.simd.or_u64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<u16x16<S>> for u16 {
-    type Output = u16x16<S>;
+impl<S: Simd> core::ops::BitOr<u64x4<S>> for u64 {
+    type Output = u64x4<S>;
     #[inline(always)]
-    fn bitor(self, rhs: u16x16<S>) -> Self::Output {
-        rhs.simd.or_u16x16(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: u64x4<S>) -> Self::Output {
+        rhs.simd.or_u64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for u16x16<S> {
+impl<S: Simd> core::ops::BitXor for u64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_u16x16(self, rhs)
+        self.simd.xor_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for u16x16<S> {
+impl<S: Simd> core::ops::BitXorAssign for u64x4<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_u16x16(*self, rhs);
+        *self = self.simd.xor_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<u16> for u16x16<S> {
+impl<S: Simd> core::ops::BitXor<u64> for u64x4<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: u16) -> Self::Output {
-        self.simd.xor_u16x16(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: u64) -> Self::Output {
+        self.simd.xor_u64x4(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<u16> for u16x16<S> {
+impl<S: Simd> core::ops::BitXorAssign<u64> for u64x4<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: u16) {
-        *self = self.simd.xor_u16x16(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: u64) {
+        *self = self.simd.xor_u64x4(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<u16x16<S>> for u16 {
-    type Output = u16x16<S>;
+impl<S: Simd> core::ops::BitXor<u64x4<S>> for u64 {
+    type Output = u64x4<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: u16x16<S>) -> Self::Output {
-        rhs.simd.xor_u16x16(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: u64x4<S>) -> Self::Output {
+        rhs.simd.xor_u64x4(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for u16x16<S> {
+impl<S: Simd> core::ops::Not for u64x4<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_u16x16(self)
+        self.simd.not_u64x4(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for u16x16<S> {
+impl<S: Simd> core::ops::Shl<u32> for u64x4<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_u16x16(self, rhs)
+        self.simd.shl_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for u16x16<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for u64x4<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_u16x16(*self, rhs);
+        *self = self.simd.shl_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for u16x16<S> {
+impl<S: Simd> core::ops::Shl for u64x4<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_u16x16(self, rhs)
+        self.simd.shlv_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for u16x16<S> {
+impl<S: Simd> core::ops::ShlAssign for u64x4<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_u16x16(*self, rhs);
+        *self = self.simd.shlv_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for u16x16<S> {
+impl<S: Simd> core::ops::Shr<u32> for u64x4<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_u16x16(self, rhs)
+        self.simd.shr_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for u16x16<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for u64x4<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_u16x16(*self, rhs);
+        *self = self.simd.shr_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for u16x16<S> {
+impl<S: Simd> core::ops::Shr for u64x4<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_u16x16(self, rhs)
+        self.simd.shrv_u64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for u16x16<S> {
+impl<S: Simd> core::ops::ShrAssign for u64x4<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_u16x16(*self, rhs);
+        *self = self.simd.shrv_u64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd for mask16x16<S> {
+impl<S: Simd> core::ops::BitAnd for mask64x4<S> {
     type Output = Self;
     #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask16x16(self, rhs)
+        self.simd.and_mask64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for mask16x16<S> {
+impl<S: Simd> core::ops::BitAndAssign for mask64x4<S> {
     #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask16x16(*self, rhs);
+        *self = self.simd.and_mask64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for mask16x16<S> {
+impl<S: Simd> core::ops::BitOr for mask64x4<S> {
     type Output = Self;
     #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask16x16(self, rhs)
+        self.simd.or_mask64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for mask16x16<S> {
+impl<S: Simd> core::ops::BitOrAssign for mask64x4<S> {
     #[doc = "Compute the logical OR of two masks."]
-    #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask16x16(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitXor for mask16x16<S> {
-    type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
-    #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask16x16(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitXorAssign for mask16x16<S> {
-    #[doc = "Compute the logical XOR of two masks."]
-    #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask16x16(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Not for mask16x16<S> {
-    type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
-    #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask16x16(self)
-    }
-}
-impl<S: Simd> core::ops::Neg for i32x8<S> {
-    type Output = Self;
-    #[doc = "Negate each element of the vector, wrapping on overflow."]
-    #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_i32x8(self)
-    }
-}
-impl<S: Simd> core::ops::Add for i32x8<S> {
-    type Output = Self;
-    #[doc = "Add two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_i32x8(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::AddAssign for i32x8<S> {
-    #[doc = "Add two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_i32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Add<i32> for i32x8<S> {
-    type Output = Self;
-    #[inline(always)]
-    fn add(self, rhs: i32) -> Self::Output {
-        self.simd.add_i32x8(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::AddAssign<i32> for i32x8<S> {
-    #[inline(always)]
-    fn add_assign(&mut self, rhs: i32) {
-        *self = self.simd.add_i32x8(*self, rhs.simd_into(self.simd));
-    }
-}
-impl<S: Simd> core::ops::Add<i32x8<S>> for i32 {
-    type Output = i32x8<S>;
-    #[inline(always)]
-    fn add(self, rhs: i32x8<S>) -> Self::Output {
-        rhs.simd.add_i32x8(self.simd_into(rhs.simd), rhs)
-    }
-}
-impl<S: Simd> core::ops::Sub for i32x8<S> {
-    type Output = Self;
-    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_i32x8(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::SubAssign for i32x8<S> {
-    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_i32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Sub<i32> for i32x8<S> {
-    type Output = Self;
-    #[inline(always)]
-    fn sub(self, rhs: i32) -> Self::Output {
-        self.simd.sub_i32x8(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::SubAssign<i32> for i32x8<S> {
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: i32) {
-        *self = self.simd.sub_i32x8(*self, rhs.simd_into(self.simd));
-    }
-}
-impl<S: Simd> core::ops::Sub<i32x8<S>> for i32 {
-    type Output = i32x8<S>;
-    #[inline(always)]
-    fn sub(self, rhs: i32x8<S>) -> Self::Output {
-        rhs.simd.sub_i32x8(self.simd_into(rhs.simd), rhs)
-    }
-}
-impl<S: Simd> core::ops::Mul for i32x8<S> {
-    type Output = Self;
-    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_i32x8(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::MulAssign for i32x8<S> {
-    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_i32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Mul<i32> for i32x8<S> {
-    type Output = Self;
-    #[inline(always)]
-    fn mul(self, rhs: i32) -> Self::Output {
-        self.simd.mul_i32x8(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::MulAssign<i32> for i32x8<S> {
-    #[inline(always)]
-    fn mul_assign(&mut self, rhs: i32) {
-        *self = self.simd.mul_i32x8(*self, rhs.simd_into(self.simd));
+    #[inline(always)]
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_mask64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i32x8<S>> for i32 {
-    type Output = i32x8<S>;
+impl<S: Simd> core::ops::BitXor for mask64x4<S> {
+    type Output = Self;
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn mul(self, rhs: i32x8<S>) -> Self::Output {
-        rhs.simd.mul_i32x8(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask64x4(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for i32x8<S> {
-    type Output = Self;
-    #[doc = "Compute the bitwise AND of two vectors."]
+impl<S: Simd> core::ops::BitXorAssign for mask64x4<S> {
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_i32x8(self, rhs)
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_mask64x4(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for i32x8<S> {
-    #[doc = "Compute the bitwise AND of two vectors."]
+impl<S: Simd> core::ops::Not for mask64x4<S> {
+    type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_i32x8(*self, rhs);
+    fn not(self) -> Self::Output {
+        self.simd.not_mask64x4(self)
     }
 }
-impl<S: Simd> core::ops::BitAnd<i32> for i32x8<S> {
+impl<S: Simd> core::ops::Neg for f32x16<S> {
     type Output = Self;
+    #[doc = "Negate each element of the vector."]
     #[inline(always)]
-    fn bitand(self, rhs: i32) -> Self::Output {
-        self.simd.and_i32x8(self, rhs.simd_into(self.simd))
+    fn neg(self) -> Self::Output {
+        self.simd.neg_f32x16(self)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<i32> for i32x8<S> {
+impl<S: Simd> core::ops::Add for f32x16<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: i32) {
-        *self = self.simd.and_i32x8(*self, rhs.simd_into(self.simd));
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_f32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd<i32x8<S>> for i32 {
-    type Output = i32x8<S>;
+impl<S: Simd> core::ops::AddAssign for f32x16<S> {
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn bitand(self, rhs: i32x8<S>) -> Self::Output {
-        rhs.simd.and_i32x8(self.simd_into(rhs.simd), rhs)
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_f32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for i32x8<S> {
+impl<S: Simd> core::ops::Add<f32> for f32x16<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_i32x8(self, rhs)
+    fn add(self, rhs: f32) -> Self::Output {
+        self.simd.add_f32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for i32x8<S> {
-    #[doc = "Compute the bitwise OR of two vectors."]
+impl<S: Simd> core::ops::AddAssign<f32> for f32x16<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_i32x8(*self, rhs);
+    fn add_assign(&mut self, rhs: f32) {
+        *self = self.simd.add_f32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<i32> for i32x8<S> {
-    type Output = Self;
+impl<S: Simd> core::ops::Add<f32x16<S>> for f32 {
+    type Output = f32x16<S>;
     #[inline(always)]
-    fn bitor(self, rhs: i32) -> Self::Output {
-        self.simd.or_i32x8(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: f32x16<S>) -> Self::Output {
+        rhs.simd.add_f32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<i32> for i32x8<S> {
+impl<S: Simd> core::ops::Sub for f32x16<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: i32) {
-        *self = self.simd.or_i32x8(*self, rhs.simd_into(self.simd));
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_f32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr<i32x8<S>> for i32 {
-    type Output = i32x8<S>;
+impl<S: Simd> core::ops::SubAssign for f32x16<S> {
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn bitor(self, rhs: i32x8<S>) -> Self::Output {
-        rhs.simd.or_i32x8(self.simd_into(rhs.simd), rhs)
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_f32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor for i32x8<S> {
+impl<S: Simd> core::ops::Sub<f32> for f32x16<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_i32x8(self, rhs)
+    fn sub(self, rhs: f32) -> Self::Output {
+        self.simd.sub_f32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for i32x8<S> {
-    #[doc = "Compute the bitwise XOR of two vectors."]
+impl<S: Simd> core::ops::SubAssign<f32> for f32x16<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_i32x8(*self, rhs);
+    fn sub_assign(&mut self, rhs: f32) {
+        *self = self.simd.sub_f32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<i32> for i32x8<S> {
-    type Output = Self;
+impl<S: Simd> core::ops::Sub<f32x16<S>> for f32 {
+    type Output = f32x16<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: i32) -> Self::Output {
-        self.simd.xor_i32x8(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: f32x16<S>) -> Self::Output {
+        rhs.simd.sub_f32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<i32> for i32x8<S> {
+impl<S: Simd> core::ops::Mul for f32x16<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: i32) {
-        *self = self.simd.xor_i32x8(*self, rhs.simd_into(self.simd));
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_f32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor<i32x8<S>> for i32 {
-    type Output = i32x8<S>;
+impl<S: Simd> core::ops::MulAssign for f32x16<S> {
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitxor(self, rhs: i32x8<S>) -> Self::Output {
-        rhs.simd.xor_i32x8(self.simd_into(rhs.simd), rhs)
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_f32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Not for i32x8<S> {
+impl<S: Simd> core::ops::Mul<f32> for f32x16<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_i32x8(self)
+    fn mul(self, rhs: f32) -> Self::Output {
+        self.simd.mul_f32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for i32x8<S> {
-    type Output = Self;
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+impl<S: Simd> core::ops::MulAssign<f32> for f32x16<S> {
     #[inline(always)]
-    fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_i32x8(self, rhs)
+    fn mul_assign(&mut self, rhs: f32) {
+        *self = self.simd.mul_f32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for i32x8<S> {
+impl<S: Simd> core::ops::Mul<f32x16<S>> for f32 {
+    type Output = f32x16<S>;
     #[inline(always)]
-    fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_i32x8(*self, rhs);
+    fn mul(self, rhs: f32x16<S>) -> Self::Output {
+        rhs.simd.mul_f32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Shl for i32x8<S> {
+impl<S: Simd> core::ops::Div for f32x16<S> {
     type Output = Self;
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_i32x8(self, rhs)
+    fn div(self, rhs: Self) -> Self::Output {
+        self.simd.div_f32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for i32x8<S> {
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::DivAssign for f32x16<S> {
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_i32x8(*self, rhs);
+    fn div_assign(&mut self, rhs: Self) {
+        *self = self.simd.div_f32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for i32x8<S> {
+impl<S: Simd> core::ops::Div<f32> for f32x16<S> {
     type Output = Self;
-    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
-    fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_i32x8(self, rhs)
+    fn div(self, rhs: f32) -> Self::Output {
+        self.simd.div_f32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for i32x8<S> {
+impl<S: Simd> core::ops::DivAssign<f32> for f32x16<S> {
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_i32x8(*self, rhs);
+    fn div_assign(&mut self, rhs: f32) {
+        *self = self.simd.div_f32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Shr for i32x8<S> {
-    type Output = Self;
-    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::Div<f32x16<S>> for f32 {
+    type Output = f32x16<S>;
     #[inline(always)]
-    fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_i32x8(self, rhs)
+    fn div(self, rhs: f32x16<S>) -> Self::Output {
+        rhs.simd.div_f32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for i32x8<S> {
-    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::Neg for i8x64<S> {
+    type Output = Self;
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_i32x8(*self, rhs);
+    fn neg(self) -> Self::Output {
+        self.simd.neg_i8x64(self)
     }
 }
-impl<S: Simd> core::ops::Add for u32x8<S> {
+impl<S: Simd> core::ops::Add for i8x64<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_u32x8(self, rhs)
+        self.simd.add_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for u32x8<S> {
+impl<S: Simd> core::ops::AddAssign for i8x64<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_u32x8(*self, rhs);
+        *self = self.simd.add_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<u32> for u32x8<S> {
+impl<S: Simd> core::ops::Add<i8> for i8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: u32) -> Self::Output {
-        self.simd.add_u32x8(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: i8) -> Self::Output {
+        self.simd.add_i8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::AddAssign<i8> for i8x64<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: u32) {
-        *self = self.simd.add_u32x8(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: i8) {
+        *self = self.simd.add_i8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<u32x8<S>> for u32 {
-    type Output = u32x8<S>;
+impl<S: Simd> core::ops::Add<i8x64<S>> for i8 {
+    type Output = i8x64<S>;
     #[inline(always)]
-    fn add(self, rhs: u32x8<S>) -> Self::Output {
-        rhs.simd.add_u32x8(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: i8x64<S>) -> Self::Output {
+        rhs.simd.add_i8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for u32x8<S> {
+impl<S: Simd> core::ops::Sub for i8x64<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_u32x8(self, rhs)
+        self.simd.sub_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for u32x8<S> {
+impl<S: Simd> core::ops::SubAssign for i8x64<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_u32x8(*self, rhs);
+        *self = self.simd.sub_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<u32> for u32x8<S> {
+impl<S: Simd> core::ops::Sub<i8> for i8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: u32) -> Self::Output {
-        self.simd.sub_u32x8(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: i8) -> Self::Output {
+        self.simd.sub_i8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::SubAssign<i8> for i8x64<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: u32) {
-        *self = self.simd.sub_u32x8(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: i8) {
+        *self = self.simd.sub_i8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<u32x8<S>> for u32 {
-    type Output = u32x8<S>;
+impl<S: Simd> core::ops::Sub<i8x64<S>> for i8 {
+    type Output = i8x64<S>;
     #[inline(always)]
-    fn sub(self, rhs: u32x8<S>) -> Self::Output {
-        rhs.simd.sub_u32x8(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: i8x64<S>) -> Self::Output {
+        rhs.simd.sub_i8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for u32x8<S> {
+impl<S: Simd> core::ops::Mul for i8x64<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_u32x8(self, rhs)
+        self.simd.mul_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for u32x8<S> {
+impl<S: Simd> core::ops::MulAssign for i8x64<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_u32x8(*self, rhs);
+        *self = self.simd.mul_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<u32> for u32x8<S> {
+impl<S: Simd> core::ops::Mul<i8> for i8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: u32) -> Self::Output {
-        self.simd.mul_u32x8(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: i8) -> Self::Output {
+        self.simd.mul_i8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::MulAssign<i8> for i8x64<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: u32) {
-        *self = self.simd.mul_u32x8(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: i8) {
+        *self = self.simd.mul_i8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<u32x8<S>> for u32 {
-    type Output = u32x8<S>;
+impl<S: Simd> core::ops::Mul<i8x64<S>> for i8 {
+    type Output = i8x64<S>;
     #[inline(always)]
-    fn mul(self, rhs: u32x8<S>) -> Self::Output {
-        rhs.simd.mul_u32x8(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: i8x64<S>) -> Self::Output {
+        rhs.simd.mul_i8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for u32x8<S> {
+impl<S: Simd> core::ops::BitAnd for i8x64<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_u32x8(self, rhs)
+        self.simd.and_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for u32x8<S> {
+impl<S: Simd> core::ops::BitAndAssign for i8x64<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_u32x8(*self, rhs);
+        *self = self.simd.and_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<u32> for u32x8<S> {
+impl<S: Simd> core::ops::BitAnd<i8> for i8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: u32) -> Self::Output {
-        self.simd.and_u32x8(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: i8) -> Self::Output {
+        self.simd.and_i8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::BitAndAssign<i8> for i8x64<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: u32) {
-        *self = self.simd.and_u32x8(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: i8) {
+        *self = self.simd.and_i8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<u32x8<S>> for u32 {
-    type Output = u32x8<S>;
+impl<S: Simd> core::ops::BitAnd<i8x64<S>> for i8 {
+    type Output = i8x64<S>;
     #[inline(always)]
-    fn bitand(self, rhs: u32x8<S>) -> Self::Output {
-        rhs.simd.and_u32x8(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: i8x64<S>) -> Self::Output {
+        rhs.simd.and_i8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for u32x8<S> {
+impl<S: Simd> core::ops::BitOr for i8x64<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_u32x8(self, rhs)
+        self.simd.or_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for u32x8<S> {
+impl<S: Simd> core::ops::BitOrAssign for i8x64<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_u32x8(*self, rhs);
+        *self = self.simd.or_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<u32> for u32x8<S> {
+impl<S: Simd> core::ops::BitOr<i8> for i8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: u32) -> Self::Output {
-        self.simd.or_u32x8(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: i8) -> Self::Output {
+        self.simd.or_i8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::BitOrAssign<i8> for i8x64<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: u32) {
-        *self = self.simd.or_u32x8(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: i8) {
+        *self = self.simd.or_i8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<u32x8<S>> for u32 {
-    type Output = u32x8<S>;
+impl<S: Simd> core::ops::BitOr<i8x64<S>> for i8 {
+    type Output = i8x64<S>;
     #[inline(always)]
-    fn bitor(self, rhs: u32x8<S>) -> Self::Output {
-        rhs.simd.or_u32x8(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: i8x64<S>) -> Self::Output {
+        rhs.simd.or_i8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for u32x8<S> {
+impl<S: Simd> core::ops::BitXor for i8x64<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_u32x8(self, rhs)
+        self.simd.xor_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for u32x8<S> {
+impl<S: Simd> core::ops::BitXorAssign for i8x64<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_u32x8(*self, rhs);
+        *self = self.simd.xor_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<u32> for u32x8<S> {
+impl<S: Simd> core::ops::BitXor<i8> for i8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: u32) -> Self::Output {
-        self.simd.xor_u32x8(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: i8) -> Self::Output {
+        self.simd.xor_i8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::BitXorAssign<i8> for i8x64<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: u32) {
-        *self = self.simd.xor_u32x8(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: i8) {
+        *self = self.simd.xor_i8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<u32x8<S>> for u32 {
-    type Output = u32x8<S>;
+impl<S: Simd> core::ops::BitXor<i8x64<S>> for i8 {
+    type Output = i8x64<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: u32x8<S>) -> Self::Output {
-        rhs.simd.xor_u32x8(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: i8x64<S>) -> Self::Output {
+        rhs.simd.xor_i8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for u32x8<S> {
+impl<S: Simd> core::ops::Not for i8x64<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_u32x8(self)
+        self.simd.not_i8x64(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for u32x8<S> {
+impl<S: Simd> core::ops::Shl<u32> for i8x64<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_u32x8(self, rhs)
+        self.simd.shl_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for i8x64<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_u32x8(*self, rhs);
+        *self = self.simd.shl_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for u32x8<S> {
+impl<S: Simd> core::ops::Shl for i8x64<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_u32x8(self, rhs)
+        self.simd.shlv_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for u32x8<S> {
+impl<S: Simd> core::ops::ShlAssign for i8x64<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_u32x8(*self, rhs);
+        *self = self.simd.shlv_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for u32x8<S> {
+impl<S: Simd> core::ops::Shr<u32> for i8x64<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_u32x8(self, rhs)
+        self.simd.shr_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for u32x8<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for i8x64<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_u32x8(*self, rhs);
+        *self = self.simd.shr_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for u32x8<S> {
+impl<S: Simd> core::ops::Shr for i8x64<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_u32x8(self, rhs)
+        self.simd.shrv_i8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for u32x8<S> {
+impl<S: Simd> core::ops::ShrAssign for i8x64<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_u32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitAnd for mask32x8<S> {
-    type Output = Self;
-    #[doc = "Compute the logical AND of two masks."]
-    #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask32x8(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitAndAssign for mask32x8<S> {
-    #[doc = "Compute the logical AND of two masks."]
-    #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitOr for mask32x8<S> {
-    type Output = Self;
-    #[doc = "Compute the logical OR of two masks."]
-    #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask32x8(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitOrAssign for mask32x8<S> {
-    #[doc = "Compute the logical OR of two masks."]
-    #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitXor for mask32x8<S> {
-    type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
-    #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask32x8(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitXorAssign for mask32x8<S> {
-    #[doc = "Compute the logical XOR of two masks."]
-    #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask32x8(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Not for mask32x8<S> {
-    type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
-    #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask32x8(self)
-    }
-}
-impl<S: Simd> core::ops::Neg for f64x4<S> {
-    type Output = Self;
-    #[doc = "Negate each element of the vector."]
-    #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_f64x4(self)
+        *self = self.simd.shrv_i8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add for f64x4<S> {
+impl<S: Simd> core::ops::Add for u8x64<S> {
     type Output = Self;
-    #[doc = "Add two vectors element-wise."]
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_f64x4(self, rhs)
+        self.simd.add_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for f64x4<S> {
-    #[doc = "Add two vectors element-wise."]
+impl<S: Simd> core::ops::AddAssign for u8x64<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_f64x4(*self, rhs);
+        *self = self.simd.add_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<f64> for f64x4<S> {
+impl<S: Simd> core::ops::Add<u8> for u8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: f64) -> Self::Output {
-        self.simd.add_f64x4(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: u8) -> Self::Output {
+        self.simd.add_u8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<f64> for f64x4<S> {
+impl<S: Simd> core::ops::AddAssign<u8> for u8x64<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: f64) {
-        *self = self.simd.add_f64x4(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: u8) {
+        *self = self.simd.add_u8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<f64x4<S>> for f64 {
-    type Output = f64x4<S>;
+impl<S: Simd> core::ops::Add<u8x64<S>> for u8 {
+    type Output = u8x64<S>;
     #[inline(always)]
-    fn add(self, rhs: f64x4<S>) -> Self::Output {
-        rhs.simd.add_f64x4(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: u8x64<S>) -> Self::Output {
+        rhs.simd.add_u8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for f64x4<S> {
+impl<S: Simd> core::ops::Sub for u8x64<S> {
     type Output = Self;
-    #[doc = "Subtract two vectors element-wise."]
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_f64x4(self, rhs)
+        self.simd.sub_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for f64x4<S> {
-    #[doc = "Subtract two vectors element-wise."]
+impl<S: Simd> core::ops::SubAssign for u8x64<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_f64x4(*self, rhs);
+        *self = self.simd.sub_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<f64> for f64x4<S> {
+impl<S: Simd> core::ops::Sub<u8> for u8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: f64) -> Self::Output {
-        self.simd.sub_f64x4(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: u8) -> Self::Output {
+        self.simd.sub_u8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<f64> for f64x4<S> {
+impl<S: Simd> core::ops::SubAssign<u8> for u8x64<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: f64) {
-        *self = self.simd.sub_f64x4(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: u8) {
+        *self = self.simd.sub_u8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<f64x4<S>> for f64 {
-    type Output = f64x4<S>;
+impl<S: Simd> core::ops::Sub<u8x64<S>> for u8 {
+    type Output = u8x64<S>;
     #[inline(always)]
-    fn sub(self, rhs: f64x4<S>) -> Self::Output {
-        rhs.simd.sub_f64x4(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: u8x64<S>) -> Self::Output {
+        rhs.simd.sub_u8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for f64x4<S> {
+impl<S: Simd> core::ops::Mul for u8x64<S> {
     type Output = Self;
-    #[doc = "Multiply two vectors element-wise."]
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_f64x4(self, rhs)
+        self.simd.mul_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for f64x4<S> {
-    #[doc = "Multiply two vectors element-wise."]
+impl<S: Simd> core::ops::MulAssign for u8x64<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_f64x4(*self, rhs);
+        *self = self.simd.mul_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<f64> for f64x4<S> {
+impl<S: Simd> core::ops::Mul<u8> for u8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: f64) -> Self::Output {
-        self.simd.mul_f64x4(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: u8) -> Self::Output {
+        self.simd.mul_u8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<f64> for f64x4<S> {
+impl<S: Simd> core::ops::MulAssign<u8> for u8x64<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: f64) {
-        *self = self.simd.mul_f64x4(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: u8) {
+        *self = self.simd.mul_u8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<f64x4<S>> for f64 {
-    type Output = f64x4<S>;
+impl<S: Simd> core::ops::Mul<u8x64<S>> for u8 {
+    type Output = u8x64<S>;
     #[inline(always)]
-    fn mul(self, rhs: f64x4<S>) -> Self::Output {
-        rhs.simd.mul_f64x4(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: u8x64<S>) -> Self::Output {
+        rhs.simd.mul_u8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Div for f64x4<S> {
+impl<S: Simd> core::ops::BitAnd for u8x64<S> {
     type Output = Self;
-    #[doc = "Divide two vectors element-wise."]
+    #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
-    fn div(self, rhs: Self) -> Self::Output {
-        self.simd.div_f64x4(self, rhs)
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::DivAssign for f64x4<S> {
-    #[doc = "Divide two vectors element-wise."]
+impl<S: Simd> core::ops::BitAndAssign for u8x64<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: Self) {
-        *self = self.simd.div_f64x4(*self, rhs);
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Div<f64> for f64x4<S> {
+impl<S: Simd> core::ops::BitAnd<u8> for u8x64<S> {
     type Output = Self;
     #[inline(always)]
-    fn div(self, rhs: f64) -> Self::Output {
-        self.simd.div_f64x4(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: u8) -> Self::Output {
+        self.simd.and_u8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::DivAssign<f64> for f64x4<S> {
+impl<S: Simd> core::ops::BitAndAssign<u8> for u8x64<S> {
     #[inline(always)]
-    fn div_assign(&mut self, rhs: f64) {
-        *self = self.simd.div_f64x4(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: u8) {
+        *self = self.simd.and_u8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Div<f64x4<S>> for f64 {
-    type Output = f64x4<S>;
+impl<S: Simd> core::ops::BitAnd<u8x64<S>> for u8 {
+    type Output = u8x64<S>;
     #[inline(always)]
-    fn div(self, rhs: f64x4<S>) -> Self::Output {
-        rhs.simd.div_f64x4(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: u8x64<S>) -> Self::Output {
+        rhs.simd.and_u8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for mask64x4<S> {
+impl<S: Simd> core::ops::BitOr for u8x64<S> {
     type Output = Self;
-    #[doc = "Compute the logical AND of two masks."]
+    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask64x4(self, rhs)
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for mask64x4<S> {
-    #[doc = "Compute the logical AND of two masks."]
+impl<S: Simd> core::ops::BitOrAssign for u8x64<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask64x4(*self, rhs);
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for mask64x4<S> {
+impl<S: Simd> core::ops::BitOr<u8> for u8x64<S> {
     type Output = Self;
-    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask64x4(self, rhs)
+    fn bitor(self, rhs: u8) -> Self::Output {
+        self.simd.or_u8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for mask64x4<S> {
-    #[doc = "Compute the logical OR of two masks."]
+impl<S: Simd> core::ops::BitOrAssign<u8> for u8x64<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask64x4(*self, rhs);
+    fn bitor_assign(&mut self, rhs: u8) {
+        *self = self.simd.or_u8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor for mask64x4<S> {
-    type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
+impl<S: Simd> core::ops::BitOr<u8x64<S>> for u8 {
+    type Output = u8x64<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask64x4(self, rhs)
+    fn bitor(self, rhs: u8x64<S>) -> Self::Output {
+        rhs.simd.or_u8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for mask64x4<S> {
-    #[doc = "Compute the logical XOR of two masks."]
+impl<S: Simd> core::ops::BitXor for u8x64<S> {
+    type Output = Self;
+    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask64x4(*self, rhs);
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Not for mask64x4<S> {
-    type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
+impl<S: Simd> core::ops::BitXorAssign for u8x64<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask64x4(self)
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Neg for f32x16<S> {
+impl<S: Simd> core::ops::BitXor<u8> for u8x64<S> {
     type Output = Self;
-    #[doc = "Negate each element of the vector."]
     #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_f32x16(self)
+    fn bitxor(self, rhs: u8) -> Self::Output {
+        self.simd.xor_u8x64(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Add for f32x16<S> {
-    type Output = Self;
-    #[doc = "Add two vectors element-wise."]
+impl<S: Simd> core::ops::BitXorAssign<u8> for u8x64<S> {
     #[inline(always)]
-    fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_f32x16(self, rhs)
+    fn bitxor_assign(&mut self, rhs: u8) {
+        *self = self.simd.xor_u8x64(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::AddAssign for f32x16<S> {
-    #[doc = "Add two vectors element-wise."]
+impl<S: Simd> core::ops::BitXor<u8x64<S>> for u8 {
+    type Output = u8x64<S>;
     #[inline(always)]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_f32x16(*self, rhs);
+    fn bitxor(self, rhs: u8x64<S>) -> Self::Output {
+        rhs.simd.xor_u8x64(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Add<f32> for f32x16<S> {
+impl<S: Simd> core::ops::Not for u8x64<S> {
     type Output = Self;
+    #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
-    fn add(self, rhs: f32) -> Self::Output {
-        self.simd.add_f32x16(self, rhs.simd_into(self.simd))
+    fn not(self) -> Self::Output {
+        self.simd.not_u8x64(self)
     }
 }
-impl<S: Simd> core::ops::AddAssign<f32> for f32x16<S> {
+impl<S: Simd> core::ops::Shl<u32> for u8x64<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
-    fn add_assign(&mut self, rhs: f32) {
-        *self = self.simd.add_f32x16(*self, rhs.simd_into(self.simd));
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Add<f32x16<S>> for f32 {
-    type Output = f32x16<S>;
+impl<S: Simd> core::ops::ShlAssign<u32> for u8x64<S> {
     #[inline(always)]
-    fn add(self, rhs: f32x16<S>) -> Self::Output {
-        rhs.simd.add_f32x16(self.simd_into(rhs.simd), rhs)
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub for f32x16<S> {
+impl<S: Simd> core::ops::Shl for u8x64<S> {
     type Output = Self;
-    #[doc = "Subtract two vectors element-wise."]
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_f32x16(self, rhs)
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for f32x16<S> {
-    #[doc = "Subtract two vectors element-wise."]
+impl<S: Simd> core::ops::ShlAssign for u8x64<S> {
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_f32x16(*self, rhs);
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<f32> for f32x16<S> {
+impl<S: Simd> core::ops::Shr<u32> for u8x64<S> {
     type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
-    fn sub(self, rhs: f32) -> Self::Output {
-        self.simd.sub_f32x16(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::SubAssign<f32> for f32x16<S> {
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: f32) {
-        *self = self.simd.sub_f32x16(*self, rhs.simd_into(self.simd));
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Sub<f32x16<S>> for f32 {
-    type Output = f32x16<S>;
+impl<S: Simd> core::ops::ShrAssign<u32> for u8x64<S> {
     #[inline(always)]
-    fn sub(self, rhs: f32x16<S>) -> Self::Output {
-        rhs.simd.sub_f32x16(self.simd_into(rhs.simd), rhs)
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul for f32x16<S> {
+impl<S: Simd> core::ops::Shr for u8x64<S> {
     type Output = Self;
-    #[doc = "Multiply two vectors element-wise."]
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_f32x16(self, rhs)
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_u8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for f32x16<S> {
-    #[doc = "Multiply two vectors element-wise."]
+impl<S: Simd> core::ops::ShrAssign for u8x64<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_f32x16(*self, rhs);
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_u8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<f32> for f32x16<S> {
+impl<S: Simd> core::ops::BitAnd for mask8x64<S> {
     type Output = Self;
+    #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
-    fn mul(self, rhs: f32) -> Self::Output {
-        self.simd.mul_f32x16(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::MulAssign<f32> for f32x16<S> {
-    #[inline(always)]
-    fn mul_assign(&mut self, rhs: f32) {
-        *self = self.simd.mul_f32x16(*self, rhs.simd_into(self.simd));
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_mask8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Mul<f32x16<S>> for f32 {
-    type Output = f32x16<S>;
+impl<S: Simd> core::ops::BitAndAssign for mask8x64<S> {
+    #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
-    fn mul(self, rhs: f32x16<S>) -> Self::Output {
-        rhs.simd.mul_f32x16(self.simd_into(rhs.simd), rhs)
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_mask8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Div for f32x16<S> {
+impl<S: Simd> core::ops::BitOr for mask8x64<S> {
     type Output = Self;
-    #[doc = "Divide two vectors element-wise."]
+    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn div(self, rhs: Self) -> Self::Output {
-        self.simd.div_f32x16(self, rhs)
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_mask8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::DivAssign for f32x16<S> {
-    #[doc = "Divide two vectors element-wise."]
+impl<S: Simd> core::ops::BitOrAssign for mask8x64<S> {
+    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: Self) {
-        *self = self.simd.div_f32x16(*self, rhs);
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_mask8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Div<f32> for f32x16<S> {
+impl<S: Simd> core::ops::BitXor for mask8x64<S> {
     type Output = Self;
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn div(self, rhs: f32) -> Self::Output {
-        self.simd.div_f32x16(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask8x64(self, rhs)
     }
 }
-impl<S: Simd> core::ops::DivAssign<f32> for f32x16<S> {
+impl<S: Simd> core::ops::BitXorAssign for mask8x64<S> {
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: f32) {
-        *self = self.simd.div_f32x16(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_mask8x64(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Div<f32x16<S>> for f32 {
-    type Output = f32x16<S>;
+impl<S: Simd> core::ops::Not for mask8x64<S> {
+    type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn div(self, rhs: f32x16<S>) -> Self::Output {
-        rhs.simd.div_f32x16(self.simd_into(rhs.simd), rhs)
+    fn not(self) -> Self::Output {
+        self.simd.not_mask8x64(self)
     }
 }
-impl<S: Simd> core::ops::Neg for i8x64<S> {
+impl<S: Simd> core::ops::Neg for i16x32<S> {
     type Output = Self;
     #[doc = "Negate each element of the vector, wrapping on overflow."]
     #[inline(always)]
     fn neg(self) -> Self::Output {
-        self.simd.neg_i8x64(self)
+        self.simd.neg_i16x32(self)
     }
 }
-impl<S: Simd> core::ops::Add for i8x64<S> {
+impl<S: Simd> core::ops::Add for i16x32<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_i8x64(self, rhs)
+        self.simd.add_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for i8x64<S> {
+impl<S: Simd> core::ops::AddAssign for i16x32<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_i8x64(*self, rhs);
+        *self = self.simd.add_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<i8> for i8x64<S> {
+impl<S: Simd> core::ops::Add<i16> for i16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: i8) -> Self::Output {
-        self.simd.add_i8x64(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: i16) -> Self::Output {
+        self.simd.add_i16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<i8> for i8x64<S> {
+impl<S: Simd> core::ops::AddAssign<i16> for i16x32<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: i8) {
-        *self = self.simd.add_i8x64(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: i16) {
+        *self = self.simd.add_i16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<i8x64<S>> for i8 {
-    type Output = i8x64<S>;
+impl<S: Simd> core::ops::Add<i16x32<S>> for i16 {
+    type Output = i16x32<S>;
     #[inline(always)]
-    fn add(self, rhs: i8x64<S>) -> Self::Output {
-        rhs.simd.add_i8x64(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: i16x32<S>) -> Self::Output {
+        rhs.simd.add_i16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for i8x64<S> {
+impl<S: Simd> core::ops::Sub for i16x32<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_i8x64(self, rhs)
+        self.simd.sub_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for i8x64<S> {
+impl<S: Simd> core::ops::SubAssign for i16x32<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_i8x64(*self, rhs);
+        *self = self.simd.sub_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<i8> for i8x64<S> {
+impl<S: Simd> core::ops::Sub<i16> for i16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: i8) -> Self::Output {
-        self.simd.sub_i8x64(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: i16) -> Self::Output {
+        self.simd.sub_i16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<i8> for i8x64<S> {
+impl<S: Simd> core::ops::SubAssign<i16> for i16x32<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: i8) {
-        *self = self.simd.sub_i8x64(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: i16) {
+        *self = self.simd.sub_i16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<i8x64<S>> for i8 {
-    type Output = i8x64<S>;
+impl<S: Simd> core::ops::Sub<i16x32<S>> for i16 {
+    type Output = i16x32<S>;
     #[inline(always)]
-    fn sub(self, rhs: i8x64<S>) -> Self::Output {
-        rhs.simd.sub_i8x64(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: i16x32<S>) -> Self::Output {
+        rhs.simd.sub_i16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for i8x64<S> {
+impl<S: Simd> core::ops::Mul for i16x32<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_i8x64(self, rhs)
+        self.simd.mul_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for i8x64<S> {
+impl<S: Simd> core::ops::MulAssign for i16x32<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_i8x64(*self, rhs);
+        *self = self.simd.mul_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i8> for i8x64<S> {
+impl<S: Simd> core::ops::Mul<i16> for i16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: i8) -> Self::Output {
-        self.simd.mul_i8x64(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: i16) -> Self::Output {
+        self.simd.mul_i16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<i8> for i8x64<S> {
+impl<S: Simd> core::ops::MulAssign<i16> for i16x32<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: i8) {
-        *self = self.simd.mul_i8x64(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: i16) {
+        *self = self.simd.mul_i16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<i8x64<S>> for i8 {
-    type Output = i8x64<S>;
+impl<S: Simd> core::ops::Mul<i16x32<S>> for i16 {
+    type Output = i16x32<S>;
     #[inline(always)]
-    fn mul(self, rhs: i8x64<S>) -> Self::Output {
-        rhs.simd.mul_i8x64(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: i16x32<S>) -> Self::Output {
+        rhs.simd.mul_i16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for i8x64<S> {
+impl<S: Simd> core::ops::BitAnd for i16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_i8x64(self, rhs)
+        self.simd.and_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for i8x64<S> {
+impl<S: Simd> core::ops::BitAndAssign for i16x32<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_i8x64(*self, rhs);
+        *self = self.simd.and_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<i8> for i8x64<S> {
+impl<S: Simd> core::ops::BitAnd<i16> for i16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: i8) -> Self::Output {
-        self.simd.and_i8x64(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: i16) -> Self::Output {
+        self.simd.and_i16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<i8> for i8x64<S> {
+impl<S: Simd> core::ops::BitAndAssign<i16> for i16x32<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: i8) {
-        *self = self.simd.and_i8x64(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: i16) {
+        *self = self.simd.and_i16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<i8x64<S>> for i8 {
-    type Output = i8x64<S>;
+impl<S: Simd> core::ops::BitAnd<i16x32<S>> for i16 {
+    type Output = i16x32<S>;
     #[inline(always)]
-    fn bitand(self, rhs: i8x64<S>) -> Self::Output {
-        rhs.simd.and_i8x64(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: i16x32<S>) -> Self::Output {
+        rhs.simd.and_i16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for i8x64<S> {
+impl<S: Simd> core::ops::BitOr for i16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_i8x64(self, rhs)
+        self.simd.or_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for i8x64<S> {
+impl<S: Simd> core::ops::BitOrAssign for i16x32<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_i8x64(*self, rhs);
+        *self = self.simd.or_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<i8> for i8x64<S> {
+impl<S: Simd> core::ops::BitOr<i16> for i16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: i8) -> Self::Output {
-        self.simd.or_i8x64(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: i16) -> Self::Output {
+        self.simd.or_i16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<i8> for i8x64<S> {
+impl<S: Simd> core::ops::BitOrAssign<i16> for i16x32<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: i8) {
-        *self = self.simd.or_i8x64(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: i16) {
+        *self = self.simd.or_i16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<i8x64<S>> for i8 {
-    type Output = i8x64<S>;
+impl<S: Simd> core::ops::BitOr<i16x32<S>> for i16 {
+    type Output = i16x32<S>;
     #[inline(always)]
-    fn bitor(self, rhs: i8x64<S>) -> Self::Output {
-        rhs.simd.or_i8x64(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: i16x32<S>) -> Self::Output {
+        rhs.simd.or_i16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for i8x64<S> {
+impl<S: Simd> core::ops::BitXor for i16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_i8x64(self, rhs)
+        self.simd.xor_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for i8x64<S> {
+impl<S: Simd> core::ops::BitXorAssign for i16x32<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_i8x64(*self, rhs);
+        *self = self.simd.xor_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<i8> for i8x64<S> {
+impl<S: Simd> core::ops::BitXor<i16> for i16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: i8) -> Self::Output {
-        self.simd.xor_i8x64(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: i16) -> Self::Output {
+        self.simd.xor_i16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<i8> for i8x64<S> {
+impl<S: Simd> core::ops::BitXorAssign<i16> for i16x32<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: i8) {
-        *self = self.simd.xor_i8x64(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: i16) {
+        *self = self.simd.xor_i16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<i8x64<S>> for i8 {
-    type Output = i8x64<S>;
+impl<S: Simd> core::ops::BitXor<i16x32<S>> for i16 {
+    type Output = i16x32<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: i8x64<S>) -> Self::Output {
-        rhs.simd.xor_i8x64(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: i16x32<S>) -> Self::Output {
+        rhs.simd.xor_i16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for i8x64<S> {
+impl<S: Simd> core::ops::Not for i16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_i8x64(self)
+        self.simd.not_i16x32(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for i8x64<S> {
+impl<S: Simd> core::ops::Shl<u32> for i16x32<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_i8x64(self, rhs)
+        self.simd.shl_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for i8x64<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for i16x32<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_i8x64(*self, rhs);
+        *self = self.simd.shl_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for i8x64<S> {
+impl<S: Simd> core::ops::Shl for i16x32<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_i8x64(self, rhs)
+        self.simd.shlv_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for i8x64<S> {
+impl<S: Simd> core::ops::ShlAssign for i16x32<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_i8x64(*self, rhs);
+        *self = self.simd.shlv_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for i8x64<S> {
+impl<S: Simd> core::ops::Shr<u32> for i16x32<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_i8x64(self, rhs)
+        self.simd.shr_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for i8x64<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for i16x32<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_i8x64(*self, rhs);
+        *self = self.simd.shr_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for i8x64<S> {
+impl<S: Simd> core::ops::Shr for i16x32<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_i8x64(self, rhs)
+        self.simd.shrv_i16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for i8x64<S> {
+impl<S: Simd> core::ops::ShrAssign for i16x32<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_i8x64(*self, rhs);
+        *self = self.simd.shrv_i16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add for u8x64<S> {
+impl<S: Simd> core::ops::Add for u16x32<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_u8x64(self, rhs)
+        self.simd.add_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for u8x64<S> {
+impl<S: Simd> core::ops::AddAssign for u16x32<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_u8x64(*self, rhs);
+        *self = self.simd.add_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<u8> for u8x64<S> {
+impl<S: Simd> core::ops::Add<u16> for u16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: u8) -> Self::Output {
-        self.simd.add_u8x64(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: u16) -> Self::Output {
+        self.simd.add_u16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<u8> for u8x64<S> {
+impl<S: Simd> core::ops::AddAssign<u16> for u16x32<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: u8) {
-        *self = self.simd.add_u8x64(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: u16) {
+        *self = self.simd.add_u16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<u8x64<S>> for u8 {
-    type Output = u8x64<S>;
+impl<S: Simd> core::ops::Add<u16x32<S>> for u16 {
+    type Output = u16x32<S>;
     #[inline(always)]
-    fn add(self, rhs: u8x64<S>) -> Self::Output {
-        rhs.simd.add_u8x64(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: u16x32<S>) -> Self::Output {
+        rhs.simd.add_u16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for u8x64<S> {
+impl<S: Simd> core::ops::Sub for u16x32<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_u8x64(self, rhs)
+        self.simd.sub_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for u8x64<S> {
+impl<S: Simd> core::ops::SubAssign for u16x32<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_u8x64(*self, rhs);
+        *self = self.simd.sub_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<u8> for u8x64<S> {
+impl<S: Simd> core::ops::Sub<u16> for u16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: u8) -> Self::Output {
-        self.simd.sub_u8x64(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: u16) -> Self::Output {
+        self.simd.sub_u16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<u8> for u8x64<S> {
+impl<S: Simd> core::ops::SubAssign<u16> for u16x32<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: u8) {
-        *self = self.simd.sub_u8x64(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: u16) {
+        *self = self.simd.sub_u16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<u8x64<S>> for u8 {
-    type Output = u8x64<S>;
+impl<S: Simd> core::ops::Sub<u16x32<S>> for u16 {
+    type Output = u16x32<S>;
     #[inline(always)]
-    fn sub(self, rhs: u8x64<S>) -> Self::Output {
-        rhs.simd.sub_u8x64(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: u16x32<S>) -> Self::Output {
+        rhs.simd.sub_u16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for u8x64<S> {
+impl<S: Simd> core::ops::Mul for u16x32<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_u8x64(self, rhs)
+        self.simd.mul_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for u8x64<S> {
+impl<S: Simd> core::ops::MulAssign for u16x32<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_u8x64(*self, rhs);
+        *self = self.simd.mul_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<u8> for u8x64<S> {
+impl<S: Simd> core::ops::Mul<u16> for u16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: u8) -> Self::Output {
-        self.simd.mul_u8x64(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: u16) -> Self::Output {
+        self.simd.mul_u16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<u8> for u8x64<S> {
+impl<S: Simd> core::ops::MulAssign<u16> for u16x32<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: u8) {
-        *self = self.simd.mul_u8x64(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: u16) {
+        *self = self.simd.mul_u16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<u8x64<S>> for u8 {
-    type Output = u8x64<S>;
+impl<S: Simd> core::ops::Mul<u16x32<S>> for u16 {
+    type Output = u16x32<S>;
     #[inline(always)]
-    fn mul(self, rhs: u8x64<S>) -> Self::Output {
-        rhs.simd.mul_u8x64(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: u16x32<S>) -> Self::Output {
+        rhs.simd.mul_u16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for u8x64<S> {
+impl<S: Simd> core::ops::BitAnd for u16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_u8x64(self, rhs)
+        self.simd.and_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for u8x64<S> {
+impl<S: Simd> core::ops::BitAndAssign for u16x32<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_u8x64(*self, rhs);
+        *self = self.simd.and_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<u8> for u8x64<S> {
+impl<S: Simd> core::ops::BitAnd<u16> for u16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: u8) -> Self::Output {
-        self.simd.and_u8x64(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: u16) -> Self::Output {
+        self.simd.and_u16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<u8> for u8x64<S> {
+impl<S: Simd> core::ops::BitAndAssign<u16> for u16x32<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: u8) {
-        *self = self.simd.and_u8x64(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: u16) {
+        *self = self.simd.and_u16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<u8x64<S>> for u8 {
-    type Output = u8x64<S>;
+impl<S: Simd> core::ops::BitAnd<u16x32<S>> for u16 {
+    type Output = u16x32<S>;
     #[inline(always)]
-    fn bitand(self, rhs: u8x64<S>) -> Self::Output {
-        rhs.simd.and_u8x64(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: u16x32<S>) -> Self::Output {
+        rhs.simd.and_u16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for u8x64<S> {
+impl<S: Simd> core::ops::BitOr for u16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_u8x64(self, rhs)
+        self.simd.or_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for u8x64<S> {
+impl<S: Simd> core::ops::BitOrAssign for u16x32<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_u8x64(*self, rhs);
+        *self = self.simd.or_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<u8> for u8x64<S> {
+impl<S: Simd> core::ops::BitOr<u16> for u16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: u8) -> Self::Output {
-        self.simd.or_u8x64(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: u16) -> Self::Output {
+        self.simd.or_u16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<u8> for u8x64<S> {
+impl<S: Simd> core::ops::BitOrAssign<u16> for u16x32<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: u8) {
-        *self = self.simd.or_u8x64(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: u16) {
+        *self = self.simd.or_u16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<u8x64<S>> for u8 {
-    type Output = u8x64<S>;
+impl<S: Simd> core::ops::BitOr<u16x32<S>> for u16 {
+    type Output = u16x32<S>;
     #[inline(always)]
-    fn bitor(self, rhs: u8x64<S>) -> Self::Output {
-        rhs.simd.or_u8x64(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: u16x32<S>) -> Self::Output {
+        rhs.simd.or_u16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for u8x64<S> {
+impl<S: Simd> core::ops::BitXor for u16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_u8x64(self, rhs)
+        self.simd.xor_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for u8x64<S> {
+impl<S: Simd> core::ops::BitXorAssign for u16x32<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_u8x64(*self, rhs);
+        *self = self.simd.xor_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<u8> for u8x64<S> {
+impl<S: Simd> core::ops::BitXor<u16> for u16x32<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: u8) -> Self::Output {
-        self.simd.xor_u8x64(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: u16) -> Self::Output {
+        self.simd.xor_u16x32(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<u8> for u8x64<S> {
+impl<S: Simd> core::ops::BitXorAssign<u16> for u16x32<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: u8) {
-        *self = self.simd.xor_u8x64(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: u16) {
+        *self = self.simd.xor_u16x32(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<u8x64<S>> for u8 {
-    type Output = u8x64<S>;
+impl<S: Simd> core::ops::BitXor<u16x32<S>> for u16 {
+    type Output = u16x32<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: u8x64<S>) -> Self::Output {
-        rhs.simd.xor_u8x64(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: u16x32<S>) -> Self::Output {
+        rhs.simd.xor_u16x32(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for u8x64<S> {
+impl<S: Simd> core::ops::Not for u16x32<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_u8x64(self)
+        self.simd.not_u16x32(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for u8x64<S> {
+impl<S: Simd> core::ops::Shl<u32> for u16x32<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_u8x64(self, rhs)
+        self.simd.shl_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for u8x64<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for u16x32<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_u8x64(*self, rhs);
+        *self = self.simd.shl_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for u8x64<S> {
+impl<S: Simd> core::ops::Shl for u16x32<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_u8x64(self, rhs)
+        self.simd.shlv_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for u8x64<S> {
+impl<S: Simd> core::ops::ShlAssign for u16x32<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_u8x64(*self, rhs);
+        *self = self.simd.shlv_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for u8x64<S> {
+impl<S: Simd> core::ops::Shr<u32> for u16x32<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_u8x64(self, rhs)
+        self.simd.shr_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for u8x64<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for u16x32<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_u8x64(*self, rhs);
+        *self = self.simd.shr_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for u8x64<S> {
+impl<S: Simd> core::ops::Shr for u16x32<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_u8x64(self, rhs)
+        self.simd.shrv_u16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for u8x64<S> {
+impl<S: Simd> core::ops::ShrAssign for u16x32<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_u8x64(*self, rhs);
+        *self = self.simd.shrv_u16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd for mask8x64<S> {
+impl<S: Simd> core::ops::BitAnd for mask16x32<S> {
     type Output = Self;
     #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask8x64(self, rhs)
+        self.simd.and_mask16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for mask8x64<S> {
+impl<S: Simd> core::ops::BitAndAssign for mask16x32<S> {
     #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask8x64(*self, rhs);
+        *self = self.simd.and_mask16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for mask8x64<S> {
+impl<S: Simd> core::ops::BitOr for mask16x32<S> {
     type Output = Self;
     #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask8x64(self, rhs)
+        self.simd.or_mask16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for mask8x64<S> {
+impl<S: Simd> core::ops::BitOrAssign for mask16x32<S> {
     #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask8x64(*self, rhs);
+        *self = self.simd.or_mask16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor for mask8x64<S> {
+impl<S: Simd> core::ops::BitXor for mask16x32<S> {
     type Output = Self;
     #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask8x64(self, rhs)
+        self.simd.xor_mask16x32(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for mask8x64<S> {
+impl<S: Simd> core::ops::BitXorAssign for mask16x32<S> {
     #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask8x64(*self, rhs);
+        *self = self.simd.xor_mask16x32(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Not for mask8x64<S> {
+impl<S: Simd> core::ops::Not for mask16x32<S> {
     type Output = Self;
     #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_mask8x64(self)
+        self.simd.not_mask16x32(self)
     }
 }
-impl<S: Simd> core::ops::Neg for i16x32<S> {
+impl<S: Simd> core::ops::Neg for i32x16<S> {
     type Output = Self;
     #[doc = "Negate each element of the vector, wrapping on overflow."]
     #[inline(always)]
     fn neg(self) -> Self::Output {
-        self.simd.neg_i16x32(self)
+        self.simd.neg_i32x16(self)
     }
 }
-impl<S: Simd> core::ops::Add for i16x32<S> {
+impl<S: Simd> core::ops::Add for i32x16<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_i16x32(self, rhs)
+        self.simd.add_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for i16x32<S> {
+impl<S: Simd> core::ops::AddAssign for i32x16<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_i16x32(*self, rhs);
+        *self = self.simd.add_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<i16> for i16x32<S> {
+impl<S: Simd> core::ops::Add<i32> for i32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: i16) -> Self::Output {
-        self.simd.add_i16x32(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: i32) -> Self::Output {
+        self.simd.add_i32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<i16> for i16x32<S> {
+impl<S: Simd> core::ops::AddAssign<i32> for i32x16<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: i16) {
-        *self = self.simd.add_i16x32(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: i32) {
+        *self = self.simd.add_i32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<i16x32<S>> for i16 {
-    type Output = i16x32<S>;
+impl<S: Simd> core::ops::Add<i32x16<S>> for i32 {
+    type Output = i32x16<S>;
     #[inline(always)]
-    fn add(self, rhs: i16x32<S>) -> Self::Output {
-        rhs.simd.add_i16x32(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: i32x16<S>) -> Self::Output {
+        rhs.simd.add_i32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for i16x32<S> {
+impl<S: Simd> core::ops::Sub for i32x16<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_i16x32(self, rhs)
+        self.simd.sub_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for i16x32<S> {
+impl<S: Simd> core::ops::SubAssign for i32x16<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_i16x32(*self, rhs);
+        *self = self.simd.sub_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<i16> for i16x32<S> {
+impl<S: Simd> core::ops::Sub<i32> for i32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: i16) -> Self::Output {
-        self.simd.sub_i16x32(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: i32) -> Self::Output {
+        self.simd.sub_i32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<i16> for i16x32<S> {
+impl<S: Simd> core::ops::SubAssign<i32> for i32x16<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: i16) {
-        *self = self.simd.sub_i16x32(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: i32) {
+        *self = self.simd.sub_i32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<i16x32<S>> for i16 {
-    type Output = i16x32<S>;
+impl<S: Simd> core::ops::Sub<i32x16<S>> for i32 {
+    type Output = i32x16<S>;
     #[inline(always)]
-    fn sub(self, rhs: i16x32<S>) -> Self::Output {
-        rhs.simd.sub_i16x32(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: i32x16<S>) -> Self::Output {
+        rhs.simd.sub_i32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for i16x32<S> {
+impl<S: Simd> core::ops::Mul for i32x16<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_i16x32(self, rhs)
+        self.simd.mul_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for i16x32<S> {
+impl<S: Simd> core::ops::MulAssign for i32x16<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_i16x32(*self, rhs);
+        *self = self.simd.mul_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i16> for i16x32<S> {
+impl<S: Simd> core::ops::Mul<i32> for i32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: i16) -> Self::Output {
-        self.simd.mul_i16x32(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: i32) -> Self::Output {
+        self.simd.mul_i32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<i16> for i16x32<S> {
+impl<S: Simd> core::ops::MulAssign<i32> for i32x16<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: i16) {
-        *self = self.simd.mul_i16x32(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: i32) {
+        *self = self.simd.mul_i32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<i16x32<S>> for i16 {
-    type Output = i16x32<S>;
+impl<S: Simd> core::ops::Mul<i32x16<S>> for i32 {
+    type Output = i32x16<S>;
     #[inline(always)]
-    fn mul(self, rhs: i16x32<S>) -> Self::Output {
-        rhs.simd.mul_i16x32(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: i32x16<S>) -> Self::Output {
+        rhs.simd.mul_i32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for i16x32<S> {
+impl<S: Simd> core::ops::BitAnd for i32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_i16x32(self, rhs)
+        self.simd.and_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for i16x32<S> {
+impl<S: Simd> core::ops::BitAndAssign for i32x16<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_i16x32(*self, rhs);
+        *self = self.simd.and_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<i16> for i16x32<S> {
+impl<S: Simd> core::ops::BitAnd<i32> for i32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: i16) -> Self::Output {
-        self.simd.and_i16x32(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: i32) -> Self::Output {
+        self.simd.and_i32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<i16> for i16x32<S> {
+impl<S: Simd> core::ops::BitAndAssign<i32> for i32x16<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: i16) {
-        *self = self.simd.and_i16x32(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: i32) {
+        *self = self.simd.and_i32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<i16x32<S>> for i16 {
-    type Output = i16x32<S>;
+impl<S: Simd> core::ops::BitAnd<i32x16<S>> for i32 {
+    type Output = i32x16<S>;
     #[inline(always)]
-    fn bitand(self, rhs: i16x32<S>) -> Self::Output {
-        rhs.simd.and_i16x32(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: i32x16<S>) -> Self::Output {
+        rhs.simd.and_i32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for i16x32<S> {
+impl<S: Simd> core::ops::BitOr for i32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_i16x32(self, rhs)
+        self.simd.or_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for i16x32<S> {
+impl<S: Simd> core::ops::BitOrAssign for i32x16<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_i16x32(*self, rhs);
+        *self = self.simd.or_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<i16> for i16x32<S> {
+impl<S: Simd> core::ops::BitOr<i32> for i32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: i16) -> Self::Output {
-        self.simd.or_i16x32(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: i32) -> Self::Output {
+        self.simd.or_i32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<i16> for i16x32<S> {
+impl<S: Simd> core::ops::BitOrAssign<i32> for i32x16<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: i16) {
-        *self = self.simd.or_i16x32(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: i32) {
+        *self = self.simd.or_i32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<i16x32<S>> for i16 {
-    type Output = i16x32<S>;
+impl<S: Simd> core::ops::BitOr<i32x16<S>> for i32 {
+    type Output = i32x16<S>;
     #[inline(always)]
-    fn bitor(self, rhs: i16x32<S>) -> Self::Output {
-        rhs.simd.or_i16x32(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: i32x16<S>) -> Self::Output {
+        rhs.simd.or_i32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for i16x32<S> {
+impl<S: Simd> core::ops::BitXor for i32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_i16x32(self, rhs)
+        self.simd.xor_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for i16x32<S> {
+impl<S: Simd> core::ops::BitXorAssign for i32x16<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_i16x32(*self, rhs);
+        *self = self.simd.xor_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<i16> for i16x32<S> {
+impl<S: Simd> core::ops::BitXor<i32> for i32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: i16) -> Self::Output {
-        self.simd.xor_i16x32(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: i32) -> Self::Output {
+        self.simd.xor_i32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<i16> for i16x32<S> {
+impl<S: Simd> core::ops::BitXorAssign<i32> for i32x16<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: i16) {
-        *self = self.simd.xor_i16x32(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: i32) {
+        *self = self.simd.xor_i32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<i16x32<S>> for i16 {
-    type Output = i16x32<S>;
+impl<S: Simd> core::ops::BitXor<i32x16<S>> for i32 {
+    type Output = i32x16<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: i16x32<S>) -> Self::Output {
-        rhs.simd.xor_i16x32(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: i32x16<S>) -> Self::Output {
+        rhs.simd.xor_i32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for i16x32<S> {
+impl<S: Simd> core::ops::Not for i32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_i16x32(self)
+        self.simd.not_i32x16(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for i16x32<S> {
+impl<S: Simd> core::ops::Shl<u32> for i32x16<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_i16x32(self, rhs)
+        self.simd.shl_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for i16x32<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for i32x16<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_i16x32(*self, rhs);
+        *self = self.simd.shl_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for i16x32<S> {
+impl<S: Simd> core::ops::Shl for i32x16<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_i16x32(self, rhs)
+        self.simd.shlv_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for i16x32<S> {
+impl<S: Simd> core::ops::ShlAssign for i32x16<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_i16x32(*self, rhs);
+        *self = self.simd.shlv_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for i16x32<S> {
+impl<S: Simd> core::ops::Shr<u32> for i32x16<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_i16x32(self, rhs)
+        self.simd.shr_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for i16x32<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for i32x16<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_i16x32(*self, rhs);
+        *self = self.simd.shr_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for i16x32<S> {
+impl<S: Simd> core::ops::Shr for i32x16<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_i16x32(self, rhs)
+        self.simd.shrv_i32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for i16x32<S> {
+impl<S: Simd> core::ops::ShrAssign for i32x16<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_i16x32(*self, rhs);
+        *self = self.simd.shrv_i32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add for u16x32<S> {
+impl<S: Simd> core::ops::Add for u32x16<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_u16x32(self, rhs)
+        self.simd.add_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for u16x32<S> {
+impl<S: Simd> core::ops::AddAssign for u32x16<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_u16x32(*self, rhs);
+        *self = self.simd.add_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<u16> for u16x32<S> {
+impl<S: Simd> core::ops::Add<u32> for u32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: u16) -> Self::Output {
-        self.simd.add_u16x32(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: u32) -> Self::Output {
+        self.simd.add_u32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<u16> for u16x32<S> {
+impl<S: Simd> core::ops::AddAssign<u32> for u32x16<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: u16) {
-        *self = self.simd.add_u16x32(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: u32) {
+        *self = self.simd.add_u32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<u16x32<S>> for u16 {
-    type Output = u16x32<S>;
+impl<S: Simd> core::ops::Add<u32x16<S>> for u32 {
+    type Output = u32x16<S>;
     #[inline(always)]
-    fn add(self, rhs: u16x32<S>) -> Self::Output {
-        rhs.simd.add_u16x32(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: u32x16<S>) -> Self::Output {
+        rhs.simd.add_u32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for u16x32<S> {
+impl<S: Simd> core::ops::Sub for u32x16<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_u16x32(self, rhs)
+        self.simd.sub_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for u16x32<S> {
+impl<S: Simd> core::ops::SubAssign for u32x16<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_u16x32(*self, rhs);
+        *self = self.simd.sub_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<u16> for u16x32<S> {
+impl<S: Simd> core::ops::Sub<u32> for u32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: u16) -> Self::Output {
-        self.simd.sub_u16x32(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: u32) -> Self::Output {
+        self.simd.sub_u32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<u16> for u16x32<S> {
+impl<S: Simd> core::ops::SubAssign<u32> for u32x16<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: u16) {
-        *self = self.simd.sub_u16x32(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: u32) {
+        *self = self.simd.sub_u32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<u16x32<S>> for u16 {
-    type Output = u16x32<S>;
+impl<S: Simd> core::ops::Sub<u32x16<S>> for u32 {
+    type Output = u32x16<S>;
     #[inline(always)]
-    fn sub(self, rhs: u16x32<S>) -> Self::Output {
-        rhs.simd.sub_u16x32(self.simd_into(rhs.simd), rhs)
+    fn sub(self, rhs: u32x16<S>) -> Self::Output {
+        rhs.simd.sub_u32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for u16x32<S> {
+impl<S: Simd> core::ops::Mul for u32x16<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_u16x32(self, rhs)
+        self.simd.mul_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for u16x32<S> {
+impl<S: Simd> core::ops::MulAssign for u32x16<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_u16x32(*self, rhs);
+        *self = self.simd.mul_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<u16> for u16x32<S> {
+impl<S: Simd> core::ops::Mul<u32> for u32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: u16) -> Self::Output {
-        self.simd.mul_u16x32(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: u32) -> Self::Output {
+        self.simd.mul_u32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<u16> for u16x32<S> {
+impl<S: Simd> core::ops::MulAssign<u32> for u32x16<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: u16) {
-        *self = self.simd.mul_u16x32(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: u32) {
+        *self = self.simd.mul_u32x16(*self, rhs.simd_into(self.simd));
     }
-}
-impl<S: Simd> core::ops::Mul<u16x32<S>> for u16 {
-    type Output = u16x32<S>;
+}
+impl<S: Simd> core::ops::Mul<u32x16<S>> for u32 {
+    type Output = u32x16<S>;
     #[inline(always)]
-    fn mul(self, rhs: u16x32<S>) -> Self::Output {
-        rhs.simd.mul_u16x32(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: u32x16<S>) -> Self::Output {
+        rhs.simd.mul_u32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for u16x32<S> {
+impl<S: Simd> core::ops::BitAnd for u32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_u16x32(self, rhs)
+        self.simd.and_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for u16x32<S> {
+impl<S: Simd> core::ops::BitAndAssign for u32x16<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_u16x32(*self, rhs);
+        *self = self.simd.and_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<u16> for u16x32<S> {
+impl<S: Simd> core::ops::BitAnd<u32> for u32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: u16) -> Self::Output {
-        self.simd.and_u16x32(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: u32) -> Self::Output {
+        self.simd.and_u32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<u16> for u16x32<S> {
+impl<S: Simd> core::ops::BitAndAssign<u32> for u32x16<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: u16) {
-        *self = self.simd.and_u16x32(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: u32) {
+        *self = self.simd.and_u32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<u16x32<S>> for u16 {
-    type Output = u16x32<S>;
+impl<S: Simd> core::ops::BitAnd<u32x16<S>> for u32 {
+    type Output = u32x16<S>;
     #[inline(always)]
-    fn bitand(self, rhs: u16x32<S>) -> Self::Output {
-        rhs.simd.and_u16x32(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: u32x16<S>) -> Self::Output {
+        rhs.simd.and_u32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for u16x32<S> {
+impl<S: Simd> core::ops::BitOr for u32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_u16x32(self, rhs)
+        self.simd.or_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for u16x32<S> {
+impl<S: Simd> core::ops::BitOrAssign for u32x16<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_u16x32(*self, rhs);
+        *self = self.simd.or_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<u16> for u16x32<S> {
+impl<S: Simd> core::ops::BitOr<u32> for u32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: u16) -> Self::Output {
-        self.simd.or_u16x32(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: u32) -> Self::Output {
+        self.simd.or_u32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<u16> for u16x32<S> {
+impl<S: Simd> core::ops::BitOrAssign<u32> for u32x16<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: u16) {
-        *self = self.simd.or_u16x32(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: u32) {
+        *self = self.simd.or_u32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<u16x32<S>> for u16 {
-    type Output = u16x32<S>;
+impl<S: Simd> core::ops::BitOr<u32x16<S>> for u32 {
+    type Output = u32x16<S>;
     #[inline(always)]
-    fn bitor(self, rhs: u16x32<S>) -> Self::Output {
-        rhs.simd.or_u16x32(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: u32x16<S>) -> Self::Output {
+        rhs.simd.or_u32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for u16x32<S> {
+impl<S: Simd> core::ops::BitXor for u32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_u16x32(self, rhs)
+        self.simd.xor_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for u16x32<S> {
+impl<S: Simd> core::ops::BitXorAssign for u32x16<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_u16x32(*self, rhs);
+        *self = self.simd.xor_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<u16> for u16x32<S> {
+impl<S: Simd> core::ops::BitXor<u32> for u32x16<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: u16) -> Self::Output {
-        self.simd.xor_u16x32(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: u32) -> Self::Output {
+        self.simd.xor_u32x16(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<u16> for u16x32<S> {
+impl<S: Simd> core::ops::BitXorAssign<u32> for u32x16<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: u16) {
-        *self = self.simd.xor_u16x32(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: u32) {
+        *self = self.simd.xor_u32x16(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<u16x32<S>> for u16 {
-    type Output = u16x32<S>;
+impl<S: Simd> core::ops::BitXor<u32x16<S>> for u32 {
+    type Output = u32x16<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: u16x32<S>) -> Self::Output {
-        rhs.simd.xor_u16x32(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: u32x16<S>) -> Self::Output {
+        rhs.simd.xor_u32x16(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for u16x32<S> {
+impl<S: Simd> core::ops::Not for u32x16<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_u16x32(self)
+        self.simd.not_u32x16(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for u16x32<S> {
+impl<S: Simd> core::ops::Shl<u32> for u32x16<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_u16x32(self, rhs)
+        self.simd.shl_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for u16x32<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for u32x16<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_u16x32(*self, rhs);
+        *self = self.simd.shl_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for u16x32<S> {
+impl<S: Simd> core::ops::Shl for u32x16<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_u16x32(self, rhs)
+        self.simd.shlv_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for u16x32<S> {
+impl<S: Simd> core::ops::ShlAssign for u32x16<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_u16x32(*self, rhs);
+        *self = self.simd.shlv_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for u16x32<S> {
+impl<S: Simd> core::ops::Shr<u32> for u32x16<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_u16x32(self, rhs)
+        self.simd.shr_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for u16x32<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for u32x16<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_u16x32(*self, rhs);
+        *self = self.simd.shr_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for u16x32<S> {
+impl<S: Simd> core::ops::Shr for u32x16<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_u16x32(self, rhs)
+        self.simd.shrv_u32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for u16x32<S> {
+impl<S: Simd> core::ops::ShrAssign for u32x16<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_u16x32(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitAnd for mask16x32<S> {
-    type Output = Self;
-    #[doc = "Compute the logical AND of two masks."]
-    #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask16x32(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitAndAssign for mask16x32<S> {
-    #[doc = "Compute the logical AND of two masks."]
-    #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask16x32(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitOr for mask16x32<S> {
-    type Output = Self;
-    #[doc = "Compute the logical OR of two masks."]
-    #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask16x32(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitOrAssign for mask16x32<S> {
-    #[doc = "Compute the logical OR of two masks."]
-    #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask16x32(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::BitXor for mask16x32<S> {
-    type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
-    #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask16x32(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::BitXorAssign for mask16x32<S> {
-    #[doc = "Compute the logical XOR of two masks."]
-    #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask16x32(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Not for mask16x32<S> {
-    type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
-    #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask16x32(self)
-    }
-}
-impl<S: Simd> core::ops::Neg for i32x16<S> {
-    type Output = Self;
-    #[doc = "Negate each element of the vector, wrapping on overflow."]
-    #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_i32x16(self)
-    }
-}
-impl<S: Simd> core::ops::Add for i32x16<S> {
-    type Output = Self;
-    #[doc = "Add two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_i32x16(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::AddAssign for i32x16<S> {
-    #[doc = "Add two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_i32x16(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Add<i32> for i32x16<S> {
-    type Output = Self;
-    #[inline(always)]
-    fn add(self, rhs: i32) -> Self::Output {
-        self.simd.add_i32x16(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::AddAssign<i32> for i32x16<S> {
-    #[inline(always)]
-    fn add_assign(&mut self, rhs: i32) {
-        *self = self.simd.add_i32x16(*self, rhs.simd_into(self.simd));
-    }
-}
-impl<S: Simd> core::ops::Add<i32x16<S>> for i32 {
-    type Output = i32x16<S>;
-    #[inline(always)]
-    fn add(self, rhs: i32x16<S>) -> Self::Output {
-        rhs.simd.add_i32x16(self.simd_into(rhs.simd), rhs)
-    }
-}
-impl<S: Simd> core::ops::Sub for i32x16<S> {
-    type Output = Self;
-    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_i32x16(self, rhs)
-    }
-}
-impl<S: Simd> core::ops::SubAssign for i32x16<S> {
-    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_i32x16(*self, rhs);
-    }
-}
-impl<S: Simd> core::ops::Sub<i32> for i32x16<S> {
-    type Output = Self;
-    #[inline(always)]
-    fn sub(self, rhs: i32) -> Self::Output {
-        self.simd.sub_i32x16(self, rhs.simd_into(self.simd))
-    }
-}
-impl<S: Simd> core::ops::SubAssign<i32> for i32x16<S> {
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: i32) {
-        *self = self.simd.sub_i32x16(*self, rhs.simd_into(self.simd));
-    }
-}
-impl<S: Simd> core::ops::Sub<i32x16<S>> for i32 {
-    type Output = i32x16<S>;
-    #[inline(always)]
-    fn sub(self, rhs: i32x16<S>) -> Self::Output {
-        rhs.simd.sub_i32x16(self.simd_into(rhs.simd), rhs)
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_u32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul for i32x16<S> {
+impl<S: Simd> core::ops::BitAnd for mask32x16<S> {
     type Output = Self;
-    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_i32x16(self, rhs)
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_mask32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for i32x16<S> {
-    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+impl<S: Simd> core::ops::BitAndAssign for mask32x16<S> {
+    #[doc = "Compute the logical AND of two masks."]
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_i32x16(*self, rhs);
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_mask32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i32> for i32x16<S> {
+impl<S: Simd> core::ops::BitOr for mask32x16<S> {
     type Output = Self;
+    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn mul(self, rhs: i32) -> Self::Output {
-        self.simd.mul_i32x16(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_mask32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign<i32> for i32x16<S> {
+impl<S: Simd> core::ops::BitOrAssign for mask32x16<S> {
+    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: i32) {
-        *self = self.simd.mul_i32x16(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_mask32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<i32x16<S>> for i32 {
-    type Output = i32x16<S>;
+impl<S: Simd> core::ops::BitXor for mask32x16<S> {
+    type Output = Self;
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn mul(self, rhs: i32x16<S>) -> Self::Output {
-        rhs.simd.mul_i32x16(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_mask32x16(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for i32x16<S> {
-    type Output = Self;
-    #[doc = "Compute the bitwise AND of two vectors."]
+impl<S: Simd> core::ops::BitXorAssign for mask32x16<S> {
+    #[doc = "Compute the logical XOR of two masks."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_i32x16(self, rhs)
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_mask32x16(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for i32x16<S> {
-    #[doc = "Compute the bitwise AND of two vectors."]
+impl<S: Simd> core::ops::Not for mask32x16<S> {
+    type Output = Self;
+    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_i32x16(*self, rhs);
+    fn not(self) -> Self::Output {
+        self.simd.not_mask32x16(self)
     }
 }
-impl<S: Simd> core::ops::BitAnd<i32> for i32x16<S> {
+impl<S: Simd> core::ops::Neg for f64x8<S> {
     type Output = Self;
+    #[doc = "Negate each element of the vector."]
     #[inline(always)]
-    fn bitand(self, rhs: i32) -> Self::Output {
-        self.simd.and_i32x16(self, rhs.simd_into(self.simd))
+    fn neg(self) -> Self::Output {
+        self.simd.neg_f64x8(self)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<i32> for i32x16<S> {
+impl<S: Simd> core::ops::Add for f64x8<S> {
+    type Output = Self;
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: i32) {
-        *self = self.simd.and_i32x16(*self, rhs.simd_into(self.simd));
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_f64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd<i32x16<S>> for i32 {
-    type Output = i32x16<S>;
+impl<S: Simd> core::ops::AddAssign for f64x8<S> {
+    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn bitand(self, rhs: i32x16<S>) -> Self::Output {
-        rhs.simd.and_i32x16(self.simd_into(rhs.simd), rhs)
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_f64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for i32x16<S> {
+impl<S: Simd> core::ops::Add<f64> for f64x8<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_i32x16(self, rhs)
+    fn add(self, rhs: f64) -> Self::Output {
+        self.simd.add_f64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for i32x16<S> {
-    #[doc = "Compute the bitwise OR of two vectors."]
+impl<S: Simd> core::ops::AddAssign<f64> for f64x8<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_i32x16(*self, rhs);
+    fn add_assign(&mut self, rhs: f64) {
+        *self = self.simd.add_f64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<i32> for i32x16<S> {
-    type Output = Self;
+impl<S: Simd> core::ops::Add<f64x8<S>> for f64 {
+    type Output = f64x8<S>;
     #[inline(always)]
-    fn bitor(self, rhs: i32) -> Self::Output {
-        self.simd.or_i32x16(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: f64x8<S>) -> Self::Output {
+        rhs.simd.add_f64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<i32> for i32x16<S> {
+impl<S: Simd> core::ops::Sub for f64x8<S> {
+    type Output = Self;
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: i32) {
-        *self = self.simd.or_i32x16(*self, rhs.simd_into(self.simd));
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_f64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr<i32x16<S>> for i32 {
-    type Output = i32x16<S>;
+impl<S: Simd> core::ops::SubAssign for f64x8<S> {
+    #[doc = "Subtract two vectors element-wise."]
     #[inline(always)]
-    fn bitor(self, rhs: i32x16<S>) -> Self::Output {
-        rhs.simd.or_i32x16(self.simd_into(rhs.simd), rhs)
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_f64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor for i32x16<S> {
+impl<S: Simd> core::ops::Sub<f64> for f64x8<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_i32x16(self, rhs)
+    fn sub(self, rhs: f64) -> Self::Output {
+        self.simd.sub_f64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for i32x16<S> {
-    #[doc = "Compute the bitwise XOR of two vectors."]
+impl<S: Simd> core::ops::SubAssign<f64> for f64x8<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_i32x16(*self, rhs);
+    fn sub_assign(&mut self, rhs: f64) {
+        *self = self.simd.sub_f64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<i32> for i32x16<S> {
-    type Output = Self;
+impl<S: Simd> core::ops::Sub<f64x8<S>> for f64 {
+    type Output = f64x8<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: i32) -> Self::Output {
-        self.simd.xor_i32x16(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: f64x8<S>) -> Self::Output {
+        rhs.simd.sub_f64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<i32> for i32x16<S> {
+impl<S: Simd> core::ops::Mul for f64x8<S> {
+    type Output = Self;
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: i32) {
-        *self = self.simd.xor_i32x16(*self, rhs.simd_into(self.simd));
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_f64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor<i32x16<S>> for i32 {
-    type Output = i32x16<S>;
+impl<S: Simd> core::ops::MulAssign for f64x8<S> {
+    #[doc = "Multiply two vectors element-wise."]
     #[inline(always)]
-    fn bitxor(self, rhs: i32x16<S>) -> Self::Output {
-        rhs.simd.xor_i32x16(self.simd_into(rhs.simd), rhs)
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_f64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Not for i32x16<S> {
+impl<S: Simd> core::ops::Mul<f64> for f64x8<S> {
     type Output = Self;
-    #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_i32x16(self)
+    fn mul(self, rhs: f64) -> Self::Output {
+        self.simd.mul_f64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for i32x16<S> {
-    type Output = Self;
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+impl<S: Simd> core::ops::MulAssign<f64> for f64x8<S> {
     #[inline(always)]
-    fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_i32x16(self, rhs)
+    fn mul_assign(&mut self, rhs: f64) {
+        *self = self.simd.mul_f64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for i32x16<S> {
+impl<S: Simd> core::ops::Mul<f64x8<S>> for f64 {
+    type Output = f64x8<S>;
     #[inline(always)]
-    fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_i32x16(*self, rhs);
+    fn mul(self, rhs: f64x8<S>) -> Self::Output {
+        rhs.simd.mul_f64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Shl for i32x16<S> {
+impl<S: Simd> core::ops::Div for f64x8<S> {
     type Output = Self;
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_i32x16(self, rhs)
+    fn div(self, rhs: Self) -> Self::Output {
+        self.simd.div_f64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for i32x16<S> {
-    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::DivAssign for f64x8<S> {
+    #[doc = "Divide two vectors element-wise."]
     #[inline(always)]
-    fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_i32x16(*self, rhs);
+    fn div_assign(&mut self, rhs: Self) {
+        *self = self.simd.div_f64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for i32x16<S> {
+impl<S: Simd> core::ops::Div<f64> for f64x8<S> {
     type Output = Self;
-    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
-    fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_i32x16(self, rhs)
+    fn div(self, rhs: f64) -> Self::Output {
+        self.simd.div_f64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for i32x16<S> {
+impl<S: Simd> core::ops::DivAssign<f64> for f64x8<S> {
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_i32x16(*self, rhs);
+    fn div_assign(&mut self, rhs: f64) {
+        *self = self.simd.div_f64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Shr for i32x16<S> {
-    type Output = Self;
-    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::Div<f64x8<S>> for f64 {
+    type Output = f64x8<S>;
     #[inline(always)]
-    fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_i32x16(self, rhs)
+    fn div(self, rhs: f64x8<S>) -> Self::Output {
+        rhs.simd.div_f64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for i32x16<S> {
-    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+impl<S: Simd> core::ops::Neg for i64x8<S> {
+    type Output = Self;
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
     #[inline(always)]
-    fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_i32x16(*self, rhs);
+    fn neg(self) -> Self::Output {
+        self.simd.neg_i64x8(self)
     }
 }
-impl<S: Simd> core::ops::Add for u32x16<S> {
+impl<S: Simd> core::ops::Add for i64x8<S> {
     type Output = Self;
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_u32x16(self, rhs)
+        self.simd.add_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign for u32x16<S> {
+impl<S: Simd> core::ops::AddAssign for i64x8<S> {
     #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_u32x16(*self, rhs);
+        *self = self.simd.add_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<u32> for u32x16<S> {
+impl<S: Simd> core::ops::Add<i64> for i64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: u32) -> Self::Output {
-        self.simd.add_u32x16(self, rhs.simd_into(self.simd))
+    fn add(self, rhs: i64) -> Self::Output {
+        self.simd.add_i64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::AddAssign<i64> for i64x8<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: u32) {
-        *self = self.simd.add_u32x16(*self, rhs.simd_into(self.simd));
+    fn add_assign(&mut self, rhs: i64) {
+        *self = self.simd.add_i64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<u32x16<S>> for u32 {
-    type Output = u32x16<S>;
+impl<S: Simd> core::ops::Add<i64x8<S>> for i64 {
+    type Output = i64x8<S>;
     #[inline(always)]
-    fn add(self, rhs: u32x16<S>) -> Self::Output {
-        rhs.simd.add_u32x16(self.simd_into(rhs.simd), rhs)
+    fn add(self, rhs: i64x8<S>) -> Self::Output {
+        rhs.simd.add_i64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Sub for u32x16<S> {
+impl<S: Simd> core::ops::Sub for i64x8<S> {
     type Output = Self;
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_u32x16(self, rhs)
+        self.simd.sub_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for u32x16<S> {
+impl<S: Simd> core::ops::SubAssign for i64x8<S> {
     #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_u32x16(*self, rhs);
+        *self = self.simd.sub_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<u32> for u32x16<S> {
+impl<S: Simd> core::ops::Sub<i64> for i64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: u32) -> Self::Output {
-        self.simd.sub_u32x16(self, rhs.simd_into(self.simd))
+    fn sub(self, rhs: i64) -> Self::Output {
+        self.simd.sub_i64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::SubAssign<i64> for i64x8<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: u32) {
-        *self = self.simd.sub_u32x16(*self, rhs.simd_into(self.simd));
+    fn sub_assign(&mut self, rhs: i64) {
+        *self = self.simd.sub_i64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<u32x16<S>> for u32 {
-    type Output = u32x16<S>;
-    #[inline(always)]
-    fn sub(self, rhs: u32x16<S>) -> Self::Output {
-        rhs.simd.sub_u32x16(self.simd_into(rhs.simd), rhs)
+impl<S: Simd> core::ops::Sub<i64x8<S>> for i64 {
+    type Output = i64x8<S>;
+    #[inline(always)]
+    fn sub(self, rhs: i64x8<S>) -> Self::Output {
+        rhs.simd.sub_i64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for u32x16<S> {
+impl<S: Simd> core::ops::Mul for i64x8<S> {
     type Output = Self;
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_u32x16(self, rhs)
+        self.simd.mul_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for u32x16<S> {
+impl<S: Simd> core::ops::MulAssign for i64x8<S> {
     #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
     fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_u32x16(*self, rhs);
+        *self = self.simd.mul_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<u32> for u32x16<S> {
+impl<S: Simd> core::ops::Mul<i64> for i64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: u32) -> Self::Output {
-        self.simd.mul_u32x16(self, rhs.simd_into(self.simd))
+    fn mul(self, rhs: i64) -> Self::Output {
+        self.simd.mul_i64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::MulAssign<i64> for i64x8<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: u32) {
-        *self = self.simd.mul_u32x16(*self, rhs.simd_into(self.simd));
+    fn mul_assign(&mut self, rhs: i64) {
+        *self = self.simd.mul_i64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<u32x16<S>> for u32 {
-    type Output = u32x16<S>;
+impl<S: Simd> core::ops::Mul<i64x8<S>> for i64 {
+    type Output = i64x8<S>;
     #[inline(always)]
-    fn mul(self, rhs: u32x16<S>) -> Self::Output {
-        rhs.simd.mul_u32x16(self.simd_into(rhs.simd), rhs)
+    fn mul(self, rhs: i64x8<S>) -> Self::Output {
+        rhs.simd.mul_i64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitAnd for u32x16<S> {
+impl<S: Simd> core::ops::BitAnd for i64x8<S> {
     type Output = Self;
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_u32x16(self, rhs)
+        self.simd.and_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for u32x16<S> {
+impl<S: Simd> core::ops::BitAndAssign for i64x8<S> {
     #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
     fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_u32x16(*self, rhs);
+        *self = self.simd.and_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd<u32> for u32x16<S> {
+impl<S: Simd> core::ops::BitAnd<i64> for i64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitand(self, rhs: u32) -> Self::Output {
-        self.simd.and_u32x16(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: i64) -> Self::Output {
+        self.simd.and_i64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitAndAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::BitAndAssign<i64> for i64x8<S> {
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: u32) {
-        *self = self.simd.and_u32x16(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: i64) {
+        *self = self.simd.and_i64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitAnd<u32x16<S>> for u32 {
-    type Output = u32x16<S>;
+impl<S: Simd> core::ops::BitAnd<i64x8<S>> for i64 {
+    type Output = i64x8<S>;
     #[inline(always)]
-    fn bitand(self, rhs: u32x16<S>) -> Self::Output {
-        rhs.simd.and_u32x16(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: i64x8<S>) -> Self::Output {
+        rhs.simd.and_i64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitOr for u32x16<S> {
+impl<S: Simd> core::ops::BitOr for i64x8<S> {
     type Output = Self;
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_u32x16(self, rhs)
+        self.simd.or_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for u32x16<S> {
+impl<S: Simd> core::ops::BitOrAssign for i64x8<S> {
     #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
     fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_u32x16(*self, rhs);
+        *self = self.simd.or_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr<u32> for u32x16<S> {
+impl<S: Simd> core::ops::BitOr<i64> for i64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitor(self, rhs: u32) -> Self::Output {
-        self.simd.or_u32x16(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: i64) -> Self::Output {
+        self.simd.or_i64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::BitOrAssign<i64> for i64x8<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: u32) {
-        *self = self.simd.or_u32x16(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: i64) {
+        *self = self.simd.or_i64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitOr<u32x16<S>> for u32 {
-    type Output = u32x16<S>;
+impl<S: Simd> core::ops::BitOr<i64x8<S>> for i64 {
+    type Output = i64x8<S>;
     #[inline(always)]
-    fn bitor(self, rhs: u32x16<S>) -> Self::Output {
-        rhs.simd.or_u32x16(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: i64x8<S>) -> Self::Output {
+        rhs.simd.or_i64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::BitXor for u32x16<S> {
+impl<S: Simd> core::ops::BitXor for i64x8<S> {
     type Output = Self;
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_u32x16(self, rhs)
+        self.simd.xor_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for u32x16<S> {
+impl<S: Simd> core::ops::BitXorAssign for i64x8<S> {
     #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
     fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_u32x16(*self, rhs);
+        *self = self.simd.xor_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitXor<u32> for u32x16<S> {
+impl<S: Simd> core::ops::BitXor<i64> for i64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn bitxor(self, rhs: u32) -> Self::Output {
-        self.simd.xor_u32x16(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: i64) -> Self::Output {
+        self.simd.xor_i64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitXorAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::BitXorAssign<i64> for i64x8<S> {
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: u32) {
-        *self = self.simd.xor_u32x16(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: i64) {
+        *self = self.simd.xor_i64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor<u32x16<S>> for u32 {
-    type Output = u32x16<S>;
+impl<S: Simd> core::ops::BitXor<i64x8<S>> for i64 {
+    type Output = i64x8<S>;
     #[inline(always)]
-    fn bitxor(self, rhs: u32x16<S>) -> Self::Output {
-        rhs.simd.xor_u32x16(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: i64x8<S>) -> Self::Output {
+        rhs.simd.xor_i64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Not for u32x16<S> {
+impl<S: Simd> core::ops::Not for i64x8<S> {
     type Output = Self;
     #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
     fn not(self) -> Self::Output {
-        self.simd.not_u32x16(self)
+        self.simd.not_i64x8(self)
     }
 }
-impl<S: Simd> core::ops::Shl<u32> for u32x16<S> {
+impl<S: Simd> core::ops::Shl<u32> for i64x8<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
     fn shl(self, rhs: u32) -> Self::Output {
-        self.simd.shl_u32x16(self, rhs)
+        self.simd.shl_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for i64x8<S> {
     #[inline(always)]
     fn shl_assign(&mut self, rhs: u32) {
-        *self = self.simd.shl_u32x16(*self, rhs);
+        *self = self.simd.shl_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shl for u32x16<S> {
+impl<S: Simd> core::ops::Shl for i64x8<S> {
     type Output = Self;
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl(self, rhs: Self) -> Self::Output {
-        self.simd.shlv_u32x16(self, rhs)
+        self.simd.shlv_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShlAssign for u32x16<S> {
+impl<S: Simd> core::ops::ShlAssign for i64x8<S> {
     #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shl_assign(&mut self, rhs: Self) {
-        *self = self.simd.shlv_u32x16(*self, rhs);
+        *self = self.simd.shlv_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr<u32> for u32x16<S> {
+impl<S: Simd> core::ops::Shr<u32> for i64x8<S> {
     type Output = Self;
     #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
     fn shr(self, rhs: u32) -> Self::Output {
-        self.simd.shr_u32x16(self, rhs)
+        self.simd.shr_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign<u32> for u32x16<S> {
+impl<S: Simd> core::ops::ShrAssign<u32> for i64x8<S> {
     #[inline(always)]
     fn shr_assign(&mut self, rhs: u32) {
-        *self = self.simd.shr_u32x16(*self, rhs);
+        *self = self.simd.shr_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Shr for u32x16<S> {
+impl<S: Simd> core::ops::Shr for i64x8<S> {
     type Output = Self;
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr(self, rhs: Self) -> Self::Output {
-        self.simd.shrv_u32x16(self, rhs)
+        self.simd.shrv_i64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::ShrAssign for u32x16<S> {
+impl<S: Simd> core::ops::ShrAssign for i64x8<S> {
     #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
     fn shr_assign(&mut self, rhs: Self) {
-        *self = self.simd.shrv_u32x16(*self, rhs);
+        *self = self.simd.shrv_i64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitAnd for mask32x16<S> {
+impl<S: Simd> core::ops::Add for u64x8<S> {
     type Output = Self;
-    #[doc = "Compute the logical AND of two masks."]
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitand(self, rhs: Self) -> Self::Output {
-        self.simd.and_mask32x16(self, rhs)
+    fn add(self, rhs: Self) -> Self::Output {
+        self.simd.add_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitAndAssign for mask32x16<S> {
-    #[doc = "Compute the logical AND of two masks."]
+impl<S: Simd> core::ops::AddAssign for u64x8<S> {
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitand_assign(&mut self, rhs: Self) {
-        *self = self.simd.and_mask32x16(*self, rhs);
+    fn add_assign(&mut self, rhs: Self) {
+        *self = self.simd.add_u64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::BitOr for mask32x16<S> {
+impl<S: Simd> core::ops::Add<u64> for u64x8<S> {
     type Output = Self;
-    #[doc = "Compute the logical OR of two masks."]
     #[inline(always)]
-    fn bitor(self, rhs: Self) -> Self::Output {
-        self.simd.or_mask32x16(self, rhs)
+    fn add(self, rhs: u64) -> Self::Output {
+        self.simd.add_u64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::BitOrAssign for mask32x16<S> {
-    #[doc = "Compute the logical OR of two masks."]
+impl<S: Simd> core::ops::AddAssign<u64> for u64x8<S> {
     #[inline(always)]
-    fn bitor_assign(&mut self, rhs: Self) {
-        *self = self.simd.or_mask32x16(*self, rhs);
+    fn add_assign(&mut self, rhs: u64) {
+        *self = self.simd.add_u64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::BitXor for mask32x16<S> {
+impl<S: Simd> core::ops::Add<u64x8<S>> for u64 {
+    type Output = u64x8<S>;
+    #[inline(always)]
+    fn add(self, rhs: u64x8<S>) -> Self::Output {
+        rhs.simd.add_u64x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Sub for u64x8<S> {
     type Output = Self;
-    #[doc = "Compute the logical XOR of two masks."]
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitxor(self, rhs: Self) -> Self::Output {
-        self.simd.xor_mask32x16(self, rhs)
+    fn sub(self, rhs: Self) -> Self::Output {
+        self.simd.sub_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::BitXorAssign for mask32x16<S> {
-    #[doc = "Compute the logical XOR of two masks."]
+impl<S: Simd> core::ops::SubAssign for u64x8<S> {
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn bitxor_assign(&mut self, rhs: Self) {
-        *self = self.simd.xor_mask32x16(*self, rhs);
+    fn sub_assign(&mut self, rhs: Self) {
+        *self = self.simd.sub_u64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Not for mask32x16<S> {
+impl<S: Simd> core::ops::Sub<u64> for u64x8<S> {
     type Output = Self;
-    #[doc = "Compute the logical NOT of the mask."]
     #[inline(always)]
-    fn not(self) -> Self::Output {
-        self.simd.not_mask32x16(self)
+    fn sub(self, rhs: u64) -> Self::Output {
+        self.simd.sub_u64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Neg for f64x8<S> {
+impl<S: Simd> core::ops::SubAssign<u64> for u64x8<S> {
+    #[inline(always)]
+    fn sub_assign(&mut self, rhs: u64) {
+        *self = self.simd.sub_u64x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::Sub<u64x8<S>> for u64 {
+    type Output = u64x8<S>;
+    #[inline(always)]
+    fn sub(self, rhs: u64x8<S>) -> Self::Output {
+        rhs.simd.sub_u64x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::Mul for u64x8<S> {
     type Output = Self;
-    #[doc = "Negate each element of the vector."]
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
     #[inline(always)]
-    fn neg(self) -> Self::Output {
-        self.simd.neg_f64x8(self)
+    fn mul(self, rhs: Self) -> Self::Output {
+        self.simd.mul_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Add for f64x8<S> {
+impl<S: Simd> core::ops::MulAssign for u64x8<S> {
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = self.simd.mul_u64x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Mul<u64> for u64x8<S> {
     type Output = Self;
-    #[doc = "Add two vectors element-wise."]
     #[inline(always)]
-    fn add(self, rhs: Self) -> Self::Output {
-        self.simd.add_f64x8(self, rhs)
+    fn mul(self, rhs: u64) -> Self::Output {
+        self.simd.mul_u64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::AddAssign for f64x8<S> {
-    #[doc = "Add two vectors element-wise."]
+impl<S: Simd> core::ops::MulAssign<u64> for u64x8<S> {
     #[inline(always)]
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.simd.add_f64x8(*self, rhs);
+    fn mul_assign(&mut self, rhs: u64) {
+        *self = self.simd.mul_u64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Add<f64> for f64x8<S> {
+impl<S: Simd> core::ops::Mul<u64x8<S>> for u64 {
+    type Output = u64x8<S>;
+    #[inline(always)]
+    fn mul(self, rhs: u64x8<S>) -> Self::Output {
+        rhs.simd.mul_u64x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitAnd for u64x8<S> {
     type Output = Self;
+    #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
-    fn add(self, rhs: f64) -> Self::Output {
-        self.simd.add_f64x8(self, rhs.simd_into(self.simd))
+    fn bitand(self, rhs: Self) -> Self::Output {
+        self.simd.and_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::AddAssign<f64> for f64x8<S> {
+impl<S: Simd> core::ops::BitAndAssign for u64x8<S> {
+    #[doc = "Compute the bitwise AND of two vectors."]
     #[inline(always)]
-    fn add_assign(&mut self, rhs: f64) {
-        *self = self.simd.add_f64x8(*self, rhs.simd_into(self.simd));
+    fn bitand_assign(&mut self, rhs: Self) {
+        *self = self.simd.and_u64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Add<f64x8<S>> for f64 {
-    type Output = f64x8<S>;
+impl<S: Simd> core::ops::BitAnd<u64> for u64x8<S> {
+    type Output = Self;
     #[inline(always)]
-    fn add(self, rhs: f64x8<S>) -> Self::Output {
-        rhs.simd.add_f64x8(self.simd_into(rhs.simd), rhs)
+    fn bitand(self, rhs: u64) -> Self::Output {
+        self.simd.and_u64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::Sub for f64x8<S> {
+impl<S: Simd> core::ops::BitAndAssign<u64> for u64x8<S> {
+    #[inline(always)]
+    fn bitand_assign(&mut self, rhs: u64) {
+        *self = self.simd.and_u64x8(*self, rhs.simd_into(self.simd));
+    }
+}
+impl<S: Simd> core::ops::BitAnd<u64x8<S>> for u64 {
+    type Output = u64x8<S>;
+    #[inline(always)]
+    fn bitand(self, rhs: u64x8<S>) -> Self::Output {
+        rhs.simd.and_u64x8(self.simd_into(rhs.simd), rhs)
+    }
+}
+impl<S: Simd> core::ops::BitOr for u64x8<S> {
     type Output = Self;
-    #[doc = "Subtract two vectors element-wise."]
+    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.simd.sub_f64x8(self, rhs)
+    fn bitor(self, rhs: Self) -> Self::Output {
+        self.simd.or_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::SubAssign for f64x8<S> {
-    #[doc = "Subtract two vectors element-wise."]
+impl<S: Simd> core::ops::BitOrAssign for u64x8<S> {
+    #[doc = "Compute the bitwise OR of two vectors."]
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.simd.sub_f64x8(*self, rhs);
+    fn bitor_assign(&mut self, rhs: Self) {
+        *self = self.simd.or_u64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Sub<f64> for f64x8<S> {
+impl<S: Simd> core::ops::BitOr<u64> for u64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn sub(self, rhs: f64) -> Self::Output {
-        self.simd.sub_f64x8(self, rhs.simd_into(self.simd))
+    fn bitor(self, rhs: u64) -> Self::Output {
+        self.simd.or_u64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::SubAssign<f64> for f64x8<S> {
+impl<S: Simd> core::ops::BitOrAssign<u64> for u64x8<S> {
     #[inline(always)]
-    fn sub_assign(&mut self, rhs: f64) {
-        *self = self.simd.sub_f64x8(*self, rhs.simd_into(self.simd));
+    fn bitor_assign(&mut self, rhs: u64) {
+        *self = self.simd.or_u64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Sub<f64x8<S>> for f64 {
-    type Output = f64x8<S>;
+impl<S: Simd> core::ops::BitOr<u64x8<S>> for u64 {
+    type Output = u64x8<S>;
     #[inline(always)]
-    fn sub(self, rhs: f64x8<S>) -> Self::Output {
-        rhs.simd.sub_f64x8(self.simd_into(rhs.simd), rhs)
+    fn bitor(self, rhs: u64x8<S>) -> Self::Output {
+        rhs.simd.or_u64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Mul for f64x8<S> {
+impl<S: Simd> core::ops::BitXor for u64x8<S> {
     type Output = Self;
-    #[doc = "Multiply two vectors element-wise."]
+    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.simd.mul_f64x8(self, rhs)
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        self.simd.xor_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::MulAssign for f64x8<S> {
-    #[doc = "Multiply two vectors element-wise."]
+impl<S: Simd> core::ops::BitXorAssign for u64x8<S> {
+    #[doc = "Compute the bitwise XOR of two vectors."]
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: Self) {
-        *self = self.simd.mul_f64x8(*self, rhs);
+    fn bitxor_assign(&mut self, rhs: Self) {
+        *self = self.simd.xor_u64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Mul<f64> for f64x8<S> {
+impl<S: Simd> core::ops::BitXor<u64> for u64x8<S> {
     type Output = Self;
     #[inline(always)]
-    fn mul(self, rhs: f64) -> Self::Output {
-        self.simd.mul_f64x8(self, rhs.simd_into(self.simd))
+    fn bitxor(self, rhs: u64) -> Self::Output {
+        self.simd.xor_u64x8(self, rhs.simd_into(self.simd))
     }
 }
-impl<S: Simd> core::ops::MulAssign<f64> for f64x8<S> {
+impl<S: Simd> core::ops::BitXorAssign<u64> for u64x8<S> {
     #[inline(always)]
-    fn mul_assign(&mut self, rhs: f64) {
-        *self = self.simd.mul_f64x8(*self, rhs.simd_into(self.simd));
+    fn bitxor_assign(&mut self, rhs: u64) {
+        *self = self.simd.xor_u64x8(*self, rhs.simd_into(self.simd));
     }
 }
-impl<S: Simd> core::ops::Mul<f64x8<S>> for f64 {
-    type Output = f64x8<S>;
+impl<S: Simd> core::ops::BitXor<u64x8<S>> for u64 {
+    type Output = u64x8<S>;
     #[inline(always)]
-    fn mul(self, rhs: f64x8<S>) -> Self::Output {
-        rhs.simd.mul_f64x8(self.simd_into(rhs.simd), rhs)
+    fn bitxor(self, rhs: u64x8<S>) -> Self::Output {
+        rhs.simd.xor_u64x8(self.simd_into(rhs.simd), rhs)
     }
 }
-impl<S: Simd> core::ops::Div for f64x8<S> {
+impl<S: Simd> core::ops::Not for u64x8<S> {
     type Output = Self;
-    #[doc = "Divide two vectors element-wise."]
+    #[doc = "Compute the bitwise NOT of the vector."]
     #[inline(always)]
-    fn div(self, rhs: Self) -> Self::Output {
-        self.simd.div_f64x8(self, rhs)
+    fn not(self) -> Self::Output {
+        self.simd.not_u64x8(self)
     }
 }
-impl<S: Simd> core::ops::DivAssign for f64x8<S> {
-    #[doc = "Divide two vectors element-wise."]
+impl<S: Simd> core::ops::Shl<u32> for u64x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: Self) {
-        *self = self.simd.div_f64x8(*self, rhs);
+    fn shl(self, rhs: u32) -> Self::Output {
+        self.simd.shl_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::Div<f64> for f64x8<S> {
+impl<S: Simd> core::ops::ShlAssign<u32> for u64x8<S> {
+    #[inline(always)]
+    fn shl_assign(&mut self, rhs: u32) {
+        *self = self.simd.shl_u64x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shl for u64x8<S> {
     type Output = Self;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn div(self, rhs: f64) -> Self::Output {
-        self.simd.div_f64x8(self, rhs.simd_into(self.simd))
+    fn shl(self, rhs: Self) -> Self::Output {
+        self.simd.shlv_u64x8(self, rhs)
     }
 }
-impl<S: Simd> core::ops::DivAssign<f64> for f64x8<S> {
+impl<S: Simd> core::ops::ShlAssign for u64x8<S> {
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
     #[inline(always)]
-    fn div_assign(&mut self, rhs: f64) {
-        *self = self.simd.div_f64x8(*self, rhs.simd_into(self.simd));
+    fn shl_assign(&mut self, rhs: Self) {
+        *self = self.simd.shlv_u64x8(*self, rhs);
     }
 }
-impl<S: Simd> core::ops::Div<f64x8<S>> for f64 {
-    type Output = f64x8<S>;
+impl<S: Simd> core::ops::Shr<u32> for u64x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
     #[inline(always)]
-    fn div(self, rhs: f64x8<S>) -> Self::Output {
-        rhs.simd.div_f64x8(self.simd_into(rhs.simd), rhs)
+    fn shr(self, rhs: u32) -> Self::Output {
+        self.simd.shr_u64x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign<u32> for u64x8<S> {
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: u32) {
+        *self = self.simd.shr_u64x8(*self, rhs);
+    }
+}
+impl<S: Simd> core::ops::Shr for u64x8<S> {
+    type Output = Self;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr(self, rhs: Self) -> Self::Output {
+        self.simd.shrv_u64x8(self, rhs)
+    }
+}
+impl<S: Simd> core::ops::ShrAssign for u64x8<S> {
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    #[inline(always)]
+    fn shr_assign(&mut self, rhs: Self) {
+        *self = self.simd.shrv_u64x8(*self, rhs);
     }
 }
 impl<S: Simd> core::ops::BitAnd for mask64x8<S> {
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 7690bca33..08d5af348 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -9,9 +9,9 @@ use crate::{
 };
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 #[doc = r" The main SIMD trait, implemented by all SIMD token types."]
 #[doc = r""]
@@ -61,7 +61,13 @@ pub trait Simd:
         > + SimdCvtFloat<Self::u32s>
         + SimdCvtFloat<Self::i32s>;
     #[doc = r" A native-width SIMD vector of [`f64`]s."]
-    type f64s: SimdFloat<Self, Element = f64, Block = f64x2<Self>, Mask = Self::mask64s>;
+    type f64s: SimdFloat<
+            Self,
+            Element = f64,
+            Block = f64x2<Self>,
+            Mask = Self::mask64s,
+            Bytes = <Self::u64s as Bytes>::Bytes,
+        >;
     #[doc = r" A native-width SIMD vector of [`u8`]s."]
     type u8s: SimdInt<Self, Element = u8, Block = u8x16<Self>, Mask = Self::mask8s>;
     #[doc = r" A native-width SIMD vector of [`i8`]s."]
@@ -94,6 +100,16 @@ pub trait Simd:
             Bytes = <Self::u32s as Bytes>::Bytes,
         > + SimdCvtTruncate<Self::f32s>
         + core::ops::Neg<Output = Self::i32s>;
+    #[doc = r" A native-width SIMD vector of [`u64`]s."]
+    type u64s: SimdInt<Self, Element = u64, Block = u64x2<Self>, Mask = Self::mask64s>;
+    #[doc = r" A native-width SIMD vector of [`i64`]s."]
+    type i64s: SimdInt<
+            Self,
+            Element = i64,
+            Block = i64x2<Self>,
+            Mask = Self::mask64s,
+            Bytes = <Self::u64s as Bytes>::Bytes,
+        > + core::ops::Neg<Output = Self::i64s>;
     #[doc = r" A native-width SIMD mask with 8-bit lanes."]
     type mask8s: SimdMask<Self, Element = i8>
         + Select<Self::u8s>
@@ -111,7 +127,11 @@ pub trait Simd:
         + Select<Self::i32s>
         + Select<Self::mask32s>;
     #[doc = r" A native-width SIMD mask with 64-bit lanes."]
-    type mask64s: SimdMask<Self, Element = i64> + Select<Self::f64s> + Select<Self::mask64s>;
+    type mask64s: SimdMask<Self, Element = i64>
+        + Select<Self::f64s>
+        + Select<Self::u64s>
+        + Select<Self::i64s>
+        + Select<Self::mask64s>;
     #[doc = r" This SIMD token's feature level."]
     fn level(self) -> Level;
     #[doc = r" Call function with CPU features enabled."]
@@ -937,6 +957,172 @@ pub trait Simd:
     fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `f32` elements.\n\nThe number of elements in the result is twice that of the input."]
     fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self>;
+    #[doc = "Create a SIMD vector with all elements set to the given value."]
+    fn splat_i64x2(self, val: i64) -> i64x2<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self>;
+    #[doc = "Convert a SIMD vector to an array."]
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize];
+    #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."]
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize];
+    #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."]
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize];
+    #[doc = "Store a SIMD vector into an array of the same length."]
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> ();
+    #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."]
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self>;
+    #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."]
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self>;
+    #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"]
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Like `slide`, but operates independently on each 128-bit block."]
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self>;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self>;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."]
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."]
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."]
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."]
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."]
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`. For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."]
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>);
+    #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."]
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>);
+    #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."]
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Return the element-wise minimum of two vectors."]
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Return the element-wise maximum of two vectors."]
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."]
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self>;
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self>;
+    #[doc = "Create a SIMD vector with all elements set to the given value."]
+    fn splat_u64x2(self, val: u64) -> u64x2<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self>;
+    #[doc = "Convert a SIMD vector to an array."]
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize];
+    #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."]
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize];
+    #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."]
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize];
+    #[doc = "Store a SIMD vector into an array of the same length."]
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> ();
+    #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."]
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self>;
+    #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."]
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self>;
+    #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"]
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Like `slide`, but operates independently on each 128-bit block."]
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
+        self,
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self>;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self>;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."]
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."]
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."]
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."]
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."]
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self>;
+    #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`. For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."]
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>);
+    #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."]
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>);
+    #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."]
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Return the element-wise minimum of two vectors."]
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Return the element-wise maximum of two vectors."]
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self>;
+    #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."]
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self>;
     #[doc = "Create a SIMD mask with all lanes set from the given boolean value."]
     fn splat_mask64x2(self, val: bool) -> mask64x2<Self>;
     #[doc = "Create a SIMD mask from signed integer mask lanes."]
@@ -1819,6 +2005,176 @@ pub trait Simd:
     fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>);
     #[doc = "Reinterpret the bits of this vector as a vector of `f32` elements.\n\nThe number of elements in the result is twice that of the input."]
     fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self>;
+    #[doc = "Create a SIMD vector with all elements set to the given value."]
+    fn splat_i64x4(self, val: i64) -> i64x4<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self>;
+    #[doc = "Convert a SIMD vector to an array."]
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize];
+    #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."]
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize];
+    #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."]
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize];
+    #[doc = "Store a SIMD vector into an array of the same length."]
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> ();
+    #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."]
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self>;
+    #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."]
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self>;
+    #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"]
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Like `slide`, but operates independently on each 128-bit block."]
+    fn slide_within_blocks_i64x4<const SHIFT: usize>(
+        self,
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self>;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self>;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self>;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."]
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."]
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."]
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."]
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."]
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`. For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."]
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>);
+    #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."]
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>);
+    #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."]
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Return the element-wise minimum of two vectors."]
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Return the element-wise maximum of two vectors."]
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."]
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self>;
+    #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>);
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self>;
+    #[doc = "Create a SIMD vector with all elements set to the given value."]
+    fn splat_u64x4(self, val: u64) -> u64x4<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self>;
+    #[doc = "Convert a SIMD vector to an array."]
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize];
+    #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."]
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize];
+    #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."]
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize];
+    #[doc = "Store a SIMD vector into an array of the same length."]
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> ();
+    #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."]
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self>;
+    #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."]
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self>;
+    #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"]
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Like `slide`, but operates independently on each 128-bit block."]
+    fn slide_within_blocks_u64x4<const SHIFT: usize>(
+        self,
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self>;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self>;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self>;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."]
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."]
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."]
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."]
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."]
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self>;
+    #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`. For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."]
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>);
+    #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."]
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>);
+    #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."]
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Return the element-wise minimum of two vectors."]
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Return the element-wise maximum of two vectors."]
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self>;
+    #[doc = "Combine two vectors into a single vector with twice the width.\n\n`a` provides the lower elements and `b` provides the upper elements."]
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self>;
+    #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>);
+    #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self>;
     #[doc = "Create a SIMD mask with all lanes set from the given boolean value."]
     fn splat_mask64x4(self, val: bool) -> mask64x4<Self>;
     #[doc = "Create a SIMD mask from signed integer mask lanes."]
@@ -2695,6 +3051,176 @@ pub trait Simd:
     fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>);
     #[doc = "Reinterpret the bits of this vector as a vector of `f32` elements.\n\nThe number of elements in the result is twice that of the input."]
     fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self>;
+    #[doc = "Create a SIMD vector with all elements set to the given value."]
+    fn splat_i64x8(self, val: i64) -> i64x8<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self>;
+    #[doc = "Convert a SIMD vector to an array."]
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize];
+    #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."]
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize];
+    #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."]
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize];
+    #[doc = "Store a SIMD vector into an array of the same length."]
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> ();
+    #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."]
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self>;
+    #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."]
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self>;
+    #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"]
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Like `slide`, but operates independently on each 128-bit block."]
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
+        self,
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self>;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self>;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self>;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."]
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."]
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."]
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."]
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."]
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`. For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`. For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."]
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>);
+    #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."]
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>);
+    #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."]
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Return the element-wise minimum of two vectors."]
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Return the element-wise maximum of two vectors."]
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>);
+    #[doc = "Negate each element of the vector, wrapping on overflow."]
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self>;
+    #[doc = "Create a SIMD vector with all elements set to the given value."]
+    fn splat_u64x8(self, val: u64) -> u64x8<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self>;
+    #[doc = "Create a SIMD vector from an array of the same length."]
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self>;
+    #[doc = "Convert a SIMD vector to an array."]
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize];
+    #[doc = "Project a reference to a SIMD vector to a reference to the equivalent array."]
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize];
+    #[doc = "Project a mutable reference to a SIMD vector to a mutable reference to the equivalent array."]
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize];
+    #[doc = "Store a SIMD vector into an array of the same length."]
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> ();
+    #[doc = "Reinterpret a vector of bytes as a SIMD vector of a given type, with the equivalent byte length."]
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self>;
+    #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."]
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self>;
+    #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"]
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Like `slide`, but operates independently on each 128-bit block."]
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
+        self,
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self>;
+    #[doc = "Add two vectors element-wise, wrapping on overflow."]
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Subtract two vectors element-wise, wrapping on overflow."]
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Multiply two vectors element-wise, wrapping on overflow."]
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Compute the bitwise AND of two vectors."]
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Compute the bitwise OR of two vectors."]
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Compute the bitwise XOR of two vectors."]
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Compute the bitwise NOT of the vector."]
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Shift each element left by the given number of bits.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right."]
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self>;
+    #[doc = "Shift each element left by the corresponding element in another vector.\n\nBits shifted out of the left side are discarded, and zeros are shifted in on the right.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Shift each element right by the given number of bits.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated."]
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self>;
+    #[doc = "Shift each element right by the corresponding element in another vector.\n\nFor unsigned integers, zeros are shifted in on the left. For signed integers, the sign bit is replicated.\n\nThis operation is not implemented in hardware on all platforms. On WebAssembly, and on x86 platforms without AVX2, this will use a fallback scalar implementation."]
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Compare two vectors element-wise for equality.\n\nReturns a mask where each logical lane is true if the corresponding elements are equal, and false if not."]
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for less than.\n\nReturns a mask where each logical lane is true if `a` is less than `b`, and false if not."]
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for less than or equal.\n\nReturns a mask where each logical lane is true if `a` is less than or equal to `b`, and false if not."]
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for greater than or equal.\n\nReturns a mask where each logical lane is true if `a` is greater than or equal to `b`, and false if not."]
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Compare two vectors element-wise for greater than.\n\nReturns a mask where each logical lane is true if `a` is greater than `b`, and false if not."]
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self>;
+    #[doc = "Interleave the lower half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, b0, a1, b1]`.\n\n**Note:** This operation is only useful if you need to discard elements `a2, a3, b2, b3`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Interleave the upper half elements of two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a2, b2, a3, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a1, b0, b1`.\n        For fully interleaving two vectors prefer `interleave`,\n        which is faster than `zip_low` followed by `zip_high` on some platforms."]
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Extract even-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a0, a2, b0, b2]`.\n\n**Note:** This operation is only useful if you need to discard elements `a1, a3, b1, b3`.\n        For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Extract odd-indexed elements from two vectors.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `[a1, a3, b1, b3]`.\n\n**Note:** This operation is only useful if you need to discard elements `a0, a2, b0, b2`.\n        For fully deinterleaving two vectors prefer `deinterleave`,\n        which is faster than `unzip_low` followed by `unzip_high` on some platforms."]
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Interleave two vectors.\n\nThe resulting vectors contain elements taken alternately from `a` and `b`, first filling the first result, and then the second.\n\nThe reverse of this operation is `deinterleave`.\n\nFor vectors `[a0, a1, a2, a3]` and `[b0, b1, b2, b3]`, returns `([a0, b0, a1, b1], [a2, b2, a3, b3])`."]
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>);
+    #[doc = "Deinterleave two vectors.\n\nThe first result contains all even-indexed elements from `a` followed by all even-indexed elements from `b`. The second result contains all odd-indexed elements from `a` followed by all odd-indexed elements from `b`.\n\nThe reverse of this operation is `interleave`.\n\nFor vectors `[a0, b0, a1, b1]` and `[a2, b2, a3, b3]`, returns `([a0, a1, a2, a3], [b0, b1, b2, b3])`."]
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>);
+    #[doc = "Select elements from b and c based on the mask operand a.\n\nThis operation's behavior is unspecified if a was constructed from signed integer lanes that are neither all-zeroes (integer value 0) nor all-ones (integer value -1). See the [`Select`] trait's documentation for more information."]
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Return the element-wise minimum of two vectors."]
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Return the element-wise maximum of two vectors."]
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
+    #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>);
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self>;
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> ();
+    #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self>;
+    #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self>;
     #[doc = "Create a SIMD mask with all lanes set from the given boolean value."]
     fn splat_mask64x8(self, val: bool) -> mask64x8<Self>;
     #[doc = "Create a SIMD mask from signed integer mask lanes."]
@@ -2753,6 +3279,8 @@ pub(crate) mod arch_types {
         type u32x4: Copy + Send + Sync + SimdPod;
         type mask32x4: Copy + Send + Sync + SimdPod;
         type f64x2: Copy + Send + Sync + SimdPod;
+        type i64x2: Copy + Send + Sync + SimdPod;
+        type u64x2: Copy + Send + Sync + SimdPod;
         type mask64x2: Copy + Send + Sync + SimdPod;
         type f32x8: Copy + Send + Sync + SimdPod;
         type i8x32: Copy + Send + Sync + SimdPod;
@@ -2765,6 +3293,8 @@ pub(crate) mod arch_types {
         type u32x8: Copy + Send + Sync + SimdPod;
         type mask32x8: Copy + Send + Sync + SimdPod;
         type f64x4: Copy + Send + Sync + SimdPod;
+        type i64x4: Copy + Send + Sync + SimdPod;
+        type u64x4: Copy + Send + Sync + SimdPod;
         type mask64x4: Copy + Send + Sync + SimdPod;
         type f32x16: Copy + Send + Sync + SimdPod;
         type i8x64: Copy + Send + Sync + SimdPod;
@@ -2777,6 +3307,8 @@ pub(crate) mod arch_types {
         type u32x16: Copy + Send + Sync + SimdPod;
         type mask32x16: Copy + Send + Sync + SimdPod;
         type f64x8: Copy + Send + Sync + SimdPod;
+        type i64x8: Copy + Send + Sync + SimdPod;
+        type u64x8: Copy + Send + Sync + SimdPod;
         type mask64x8: Copy + Send + Sync + SimdPod;
     }
 }
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index c05fa1b73..1ea0b330d 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -117,8 +117,8 @@ impl<S: Simd> SimdBase<S> for f32x4<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self {
-        simd.load_array_f32x4(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> f32) -> Self {
+        simd.load_array_f32x4([f(0usize), f(1usize), f(2usize), f(3usize)])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -370,8 +370,25 @@ impl<S: Simd> SimdBase<S> for i8x16<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self {
-        simd.load_array_i8x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i8) -> Self {
+        simd.load_array_i8x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -555,8 +572,25 @@ impl<S: Simd> SimdBase<S> for u8x16<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self {
-        simd.load_array_u8x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u8) -> Self {
+        simd.load_array_u8x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -831,8 +865,17 @@ impl<S: Simd> SimdBase<S> for i16x8<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self {
-        simd.load_array_i16x8(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i16) -> Self {
+        simd.load_array_i16x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -1016,8 +1059,17 @@ impl<S: Simd> SimdBase<S> for u16x8<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self {
-        simd.load_array_u16x8(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u16) -> Self {
+        simd.load_array_u16x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -1292,8 +1344,8 @@ impl<S: Simd> SimdBase<S> for i32x4<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self {
-        simd.load_array_i32x4(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i32) -> Self {
+        simd.load_array_i32x4([f(0usize), f(1usize), f(2usize), f(3usize)])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -1489,8 +1541,8 @@ impl<S: Simd> SimdBase<S> for u32x4<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self {
-        simd.load_array_u32x4(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u32) -> Self {
+        simd.load_array_u32x4([f(0usize), f(1usize), f(2usize), f(3usize)])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -1777,8 +1829,8 @@ impl<S: Simd> SimdBase<S> for f64x2<S> {
         block
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self {
-        simd.load_array_f64x2(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> f64) -> Self {
+        simd.load_array_f64x2([f(0usize), f(1usize)])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -1906,6 +1958,376 @@ impl<S: Simd> crate::SimdCombine<S> for f64x2<S> {
         self.simd.combine_f64x2(self, rhs.simd_into(self.simd))
     }
 }
+#[doc = "A SIMD vector of 2 [`i64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, i64x2};\nfn construct_simd<S: Simd>(simd: S) {\n    // From a single scalar value:\n    let a = i64x2::splat(simd, 1);\n    let b = i64x2::simd_from(simd, 1);\n\n    // From a slice:\n    let c = i64x2::from_slice(simd, &[1, 2]);\n\n    // From an array:\n    let d = i64x2::simd_from(simd, [1, 2]);\n\n    // From an element-wise function:\n    let e = i64x2::from_fn(simd, |i| i as i64);\n}\n```"]
+#[derive(Clone, Copy)]
+#[repr(C, align(16))]
+pub struct i64x2<S: Simd> {
+    pub(crate) val: S::i64x2,
+    pub simd: S,
+}
+impl<S: Simd> Seal for i64x2<S> {}
+impl<S: Simd> SimdFrom<[i64; 2], S> for i64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, val: [i64; 2]) -> Self {
+        simd.load_array_i64x2(val)
+    }
+}
+impl<S: Simd> From<i64x2<S>> for [i64; 2] {
+    #[inline(always)]
+    fn from(value: i64x2<S>) -> Self {
+        value.simd.as_array_i64x2(value)
+    }
+}
+impl<S: Simd> core::ops::Deref for i64x2<S> {
+    type Target = [i64; 2];
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.simd.as_array_ref_i64x2(self)
+    }
+}
+impl<S: Simd> core::ops::DerefMut for i64x2<S> {
+    #[inline(always)]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.simd.as_array_mut_i64x2(self)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for i64x2<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        crate::support::simd_debug_impl(f, "i64x2", &self.simd, self.simd.as_array_ref_i64x2(self))
+    }
+}
+impl<S: Simd> SimdFrom<i64, S> for i64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, value: i64) -> Self {
+        simd.splat_i64x2(value)
+    }
+}
+impl<S: Simd> core::ops::Index<usize> for i64x2<S> {
+    type Output = i64;
+    #[inline(always)]
+    fn index(&self, i: usize) -> &Self::Output {
+        &self.simd.as_array_ref_i64x2(self)[i]
+    }
+}
+impl<S: Simd> core::ops::IndexMut<usize> for i64x2<S> {
+    #[inline(always)]
+    fn index_mut(&mut self, i: usize) -> &mut Self::Output {
+        &mut self.simd.as_array_mut_i64x2(self)[i]
+    }
+}
+impl<S: Simd> Select<i64x2<S>> for mask64x2<S> {
+    #[inline(always)]
+    fn select(self, if_true: i64x2<S>, if_false: i64x2<S>) -> i64x2<S> {
+        self.simd.select_i64x2(self, if_true, if_false)
+    }
+}
+impl<S: Simd> Bytes for i64x2<S> {
+    type Bytes = u8x16<S>;
+    #[inline(always)]
+    fn to_bytes(self) -> Self::Bytes {
+        self.simd.cvt_to_bytes_i64x2(self)
+    }
+    #[inline(always)]
+    fn from_bytes(value: Self::Bytes) -> Self {
+        value.simd.cvt_from_bytes_i64x2(value)
+    }
+}
+impl<S: Simd> SimdBase<S> for i64x2<S> {
+    type Element = i64;
+    const N: usize = 2;
+    type Mask = mask64x2<S>;
+    type Block = i64x2<S>;
+    type Array = [i64; 2];
+    #[inline(always)]
+    fn witness(&self) -> S {
+        self.simd
+    }
+    #[inline(always)]
+    fn as_slice(&self) -> &[i64] {
+        self.simd.as_array_ref_i64x2(self).as_slice()
+    }
+    #[inline(always)]
+    fn as_mut_slice(&mut self) -> &mut [i64] {
+        self.simd.as_array_mut_i64x2(self).as_mut_slice()
+    }
+    #[inline(always)]
+    fn from_slice(simd: S, slice: &[i64]) -> Self {
+        simd.load_array_ref_i64x2(slice.try_into().unwrap())
+    }
+    #[inline(always)]
+    fn store_slice(&self, slice: &mut [i64]) {
+        self.simd
+            .store_array_i64x2(*self, slice.try_into().unwrap());
+    }
+    #[inline(always)]
+    fn splat(simd: S, val: i64) -> Self {
+        simd.splat_i64x2(val)
+    }
+    #[inline(always)]
+    fn block_splat(block: Self::Block) -> Self {
+        block
+    }
+    #[inline(always)]
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i64) -> Self {
+        simd.load_array_i64x2([f(0usize), f(1usize)])
+    }
+    #[inline(always)]
+    fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_i64x2::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_within_blocks_i64x2::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdInt<S> for i64x2<S> {
+    #[inline(always)]
+    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_eq_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_lt_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_le(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_le_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_ge_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_gt_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_low_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_high_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_low_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_high_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn interleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.interleave_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn deinterleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.deinterleave_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn min(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.min_i64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn max(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.max_i64x2(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdCombine<S> for i64x2<S> {
+    type Combined = i64x4<S>;
+    #[inline(always)]
+    fn combine(self, rhs: impl SimdInto<Self, S>) -> Self::Combined {
+        self.simd.combine_i64x2(self, rhs.simd_into(self.simd))
+    }
+}
+#[doc = "A SIMD vector of 2 [`u64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, u64x2};\nfn construct_simd<S: Simd>(simd: S) {\n    // From a single scalar value:\n    let a = u64x2::splat(simd, 1);\n    let b = u64x2::simd_from(simd, 1);\n\n    // From a slice:\n    let c = u64x2::from_slice(simd, &[1, 2]);\n\n    // From an array:\n    let d = u64x2::simd_from(simd, [1, 2]);\n\n    // From an element-wise function:\n    let e = u64x2::from_fn(simd, |i| i as u64);\n}\n```"]
+#[derive(Clone, Copy)]
+#[repr(C, align(16))]
+pub struct u64x2<S: Simd> {
+    pub(crate) val: S::u64x2,
+    pub simd: S,
+}
+impl<S: Simd> Seal for u64x2<S> {}
+impl<S: Simd> SimdFrom<[u64; 2], S> for u64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, val: [u64; 2]) -> Self {
+        simd.load_array_u64x2(val)
+    }
+}
+impl<S: Simd> From<u64x2<S>> for [u64; 2] {
+    #[inline(always)]
+    fn from(value: u64x2<S>) -> Self {
+        value.simd.as_array_u64x2(value)
+    }
+}
+impl<S: Simd> core::ops::Deref for u64x2<S> {
+    type Target = [u64; 2];
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.simd.as_array_ref_u64x2(self)
+    }
+}
+impl<S: Simd> core::ops::DerefMut for u64x2<S> {
+    #[inline(always)]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.simd.as_array_mut_u64x2(self)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for u64x2<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        crate::support::simd_debug_impl(f, "u64x2", &self.simd, self.simd.as_array_ref_u64x2(self))
+    }
+}
+impl<S: Simd> SimdFrom<u64, S> for u64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, value: u64) -> Self {
+        simd.splat_u64x2(value)
+    }
+}
+impl<S: Simd> core::ops::Index<usize> for u64x2<S> {
+    type Output = u64;
+    #[inline(always)]
+    fn index(&self, i: usize) -> &Self::Output {
+        &self.simd.as_array_ref_u64x2(self)[i]
+    }
+}
+impl<S: Simd> core::ops::IndexMut<usize> for u64x2<S> {
+    #[inline(always)]
+    fn index_mut(&mut self, i: usize) -> &mut Self::Output {
+        &mut self.simd.as_array_mut_u64x2(self)[i]
+    }
+}
+impl<S: Simd> Select<u64x2<S>> for mask64x2<S> {
+    #[inline(always)]
+    fn select(self, if_true: u64x2<S>, if_false: u64x2<S>) -> u64x2<S> {
+        self.simd.select_u64x2(self, if_true, if_false)
+    }
+}
+impl<S: Simd> Bytes for u64x2<S> {
+    type Bytes = u8x16<S>;
+    #[inline(always)]
+    fn to_bytes(self) -> Self::Bytes {
+        self.simd.cvt_to_bytes_u64x2(self)
+    }
+    #[inline(always)]
+    fn from_bytes(value: Self::Bytes) -> Self {
+        value.simd.cvt_from_bytes_u64x2(value)
+    }
+}
+impl<S: Simd> SimdBase<S> for u64x2<S> {
+    type Element = u64;
+    const N: usize = 2;
+    type Mask = mask64x2<S>;
+    type Block = u64x2<S>;
+    type Array = [u64; 2];
+    #[inline(always)]
+    fn witness(&self) -> S {
+        self.simd
+    }
+    #[inline(always)]
+    fn as_slice(&self) -> &[u64] {
+        self.simd.as_array_ref_u64x2(self).as_slice()
+    }
+    #[inline(always)]
+    fn as_mut_slice(&mut self) -> &mut [u64] {
+        self.simd.as_array_mut_u64x2(self).as_mut_slice()
+    }
+    #[inline(always)]
+    fn from_slice(simd: S, slice: &[u64]) -> Self {
+        simd.load_array_ref_u64x2(slice.try_into().unwrap())
+    }
+    #[inline(always)]
+    fn store_slice(&self, slice: &mut [u64]) {
+        self.simd
+            .store_array_u64x2(*self, slice.try_into().unwrap());
+    }
+    #[inline(always)]
+    fn splat(simd: S, val: u64) -> Self {
+        simd.splat_u64x2(val)
+    }
+    #[inline(always)]
+    fn block_splat(block: Self::Block) -> Self {
+        block
+    }
+    #[inline(always)]
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u64) -> Self {
+        simd.load_array_u64x2([f(0usize), f(1usize)])
+    }
+    #[inline(always)]
+    fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_u64x2::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_within_blocks_u64x2::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdInt<S> for u64x2<S> {
+    #[inline(always)]
+    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_eq_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_lt_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_le(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_le_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_ge_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_gt_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_low_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_high_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_low_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_high_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn interleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.interleave_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn deinterleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.deinterleave_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn min(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.min_u64x2(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn max(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.max_u64x2(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdCombine<S> for u64x2<S> {
+    type Combined = u64x4<S>;
+    #[inline(always)]
+    fn combine(self, rhs: impl SimdInto<Self, S>) -> Self::Combined {
+        self.simd.combine_u64x2(self, rhs.simd_into(self.simd))
+    }
+}
 #[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x2};\nfn construct_mask<S: Simd>(simd: S) {\n    // From a single boolean value:\n    let a = mask64x2::splat(simd, true);\n    let b = mask64x2::simd_from(simd, true);\n\n    // From signed integer mask lanes:\n    let c = mask64x2::from_slice(simd, &[-1, 0]);\n    let d = mask64x2::simd_from(simd, [-1, 0]);\n\n    // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n    let e = mask64x2::from_bitmask(simd, 0b0001);\n\n    // By setting individual lanes:\n    let mut f = mask64x2::splat(simd, false);\n    f.set(0, true);\n}\n```"]
 #[derive(Clone, Copy)]
 pub struct mask64x2<S: Simd> {
@@ -2107,8 +2529,17 @@ impl<S: Simd> SimdBase<S> for f32x8<S> {
         block.simd.combine_f32x4(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self {
-        simd.load_array_f32x8(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> f32) -> Self {
+        simd.load_array_f32x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -2367,8 +2798,41 @@ impl<S: Simd> SimdBase<S> for i8x32<S> {
         block.simd.combine_i8x16(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self {
-        simd.load_array_i8x32(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i8) -> Self {
+        simd.load_array_i8x32([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+            f(16usize),
+            f(17usize),
+            f(18usize),
+            f(19usize),
+            f(20usize),
+            f(21usize),
+            f(22usize),
+            f(23usize),
+            f(24usize),
+            f(25usize),
+            f(26usize),
+            f(27usize),
+            f(28usize),
+            f(29usize),
+            f(30usize),
+            f(31usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -2559,8 +3023,41 @@ impl<S: Simd> SimdBase<S> for u8x32<S> {
         block.simd.combine_u8x16(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self {
-        simd.load_array_u8x32(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u8) -> Self {
+        simd.load_array_u8x32([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+            f(16usize),
+            f(17usize),
+            f(18usize),
+            f(19usize),
+            f(20usize),
+            f(21usize),
+            f(22usize),
+            f(23usize),
+            f(24usize),
+            f(25usize),
+            f(26usize),
+            f(27usize),
+            f(28usize),
+            f(29usize),
+            f(30usize),
+            f(31usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -2847,8 +3344,25 @@ impl<S: Simd> SimdBase<S> for i16x16<S> {
         block.simd.combine_i16x8(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self {
-        simd.load_array_i16x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i16) -> Self {
+        simd.load_array_i16x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -3045,8 +3559,25 @@ impl<S: Simd> SimdBase<S> for u16x16<S> {
         block.simd.combine_u16x8(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self {
-        simd.load_array_u16x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u16) -> Self {
+        simd.load_array_u16x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -3329,8 +3860,17 @@ impl<S: Simd> SimdBase<S> for i32x8<S> {
         block.simd.combine_i32x4(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self {
-        simd.load_array_i32x8(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i32) -> Self {
+        simd.load_array_i32x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -3533,8 +4073,17 @@ impl<S: Simd> SimdBase<S> for u32x8<S> {
         block.simd.combine_u32x4(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self {
-        simd.load_array_u32x8(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u32) -> Self {
+        simd.load_array_u32x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -3828,8 +4377,8 @@ impl<S: Simd> SimdBase<S> for f64x4<S> {
         block.simd.combine_f64x2(block, block)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self {
-        simd.load_array_f64x4(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> f64) -> Self {
+        simd.load_array_f64x4([f(0usize), f(1usize), f(2usize), f(3usize)])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -3964,27 +4513,411 @@ impl<S: Simd> crate::SimdCombine<S> for f64x4<S> {
         self.simd.combine_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x4};\nfn construct_mask<S: Simd>(simd: S) {\n    // From a single boolean value:\n    let a = mask64x4::splat(simd, true);\n    let b = mask64x4::simd_from(simd, true);\n\n    // From signed integer mask lanes:\n    let c = mask64x4::from_slice(simd, &[-1, 0, 0, 0]);\n    let d = mask64x4::simd_from(simd, [-1, 0, 0, 0]);\n\n    // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n    let e = mask64x4::from_bitmask(simd, 0b0001);\n\n    // By setting individual lanes:\n    let mut f = mask64x4::splat(simd, false);\n    f.set(0, true);\n}\n```"]
+#[doc = "A SIMD vector of 4 [`i64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, i64x4};\nfn construct_simd<S: Simd>(simd: S) {\n    // From a single scalar value:\n    let a = i64x4::splat(simd, 1);\n    let b = i64x4::simd_from(simd, 1);\n\n    // From a slice:\n    let c = i64x4::from_slice(simd, &[1, 2, 3, 4]);\n\n    // From an array:\n    let d = i64x4::simd_from(simd, [1, 2, 3, 4]);\n\n    // From an element-wise function:\n    let e = i64x4::from_fn(simd, |i| i as i64);\n    # use fearless_simd::i64x2;\n    // From `Self::Block`:\n    let f = i64x4::block_splat(i64x2::simd_from(simd, [1, 2]));\n}\n```"]
 #[derive(Clone, Copy)]
-pub struct mask64x4<S: Simd> {
-    pub(crate) val: S::mask64x4,
-    pub(crate) simd: S,
+#[repr(C, align(32))]
+pub struct i64x4<S: Simd> {
+    pub(crate) val: S::i64x4,
+    pub simd: S,
 }
-impl<S: Simd> Seal for mask64x4<S> {}
-impl<S: Simd> SimdFrom<[i64; 4], S> for mask64x4<S> {
+impl<S: Simd> Seal for i64x4<S> {}
+impl<S: Simd> SimdFrom<[i64; 4], S> for i64x4<S> {
     #[inline(always)]
     fn simd_from(simd: S, val: [i64; 4]) -> Self {
-        simd.load_array_mask64x4(val)
+        simd.load_array_i64x4(val)
     }
 }
-impl<S: Simd> From<mask64x4<S>> for [i64; 4] {
+impl<S: Simd> From<i64x4<S>> for [i64; 4] {
     #[inline(always)]
-    fn from(value: mask64x4<S>) -> Self {
-        value.simd.as_array_mask64x4(value)
+    fn from(value: i64x4<S>) -> Self {
+        value.simd.as_array_i64x4(value)
     }
 }
-impl<S: Simd + core::fmt::Debug> core::fmt::Debug for mask64x4<S> {
-    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl<S: Simd> core::ops::Deref for i64x4<S> {
+    type Target = [i64; 4];
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.simd.as_array_ref_i64x4(self)
+    }
+}
+impl<S: Simd> core::ops::DerefMut for i64x4<S> {
+    #[inline(always)]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.simd.as_array_mut_i64x4(self)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for i64x4<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        crate::support::simd_debug_impl(f, "i64x4", &self.simd, self.simd.as_array_ref_i64x4(self))
+    }
+}
+impl<S: Simd> SimdFrom<i64, S> for i64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, value: i64) -> Self {
+        simd.splat_i64x4(value)
+    }
+}
+impl<S: Simd> core::ops::Index<usize> for i64x4<S> {
+    type Output = i64;
+    #[inline(always)]
+    fn index(&self, i: usize) -> &Self::Output {
+        &self.simd.as_array_ref_i64x4(self)[i]
+    }
+}
+impl<S: Simd> core::ops::IndexMut<usize> for i64x4<S> {
+    #[inline(always)]
+    fn index_mut(&mut self, i: usize) -> &mut Self::Output {
+        &mut self.simd.as_array_mut_i64x4(self)[i]
+    }
+}
+impl<S: Simd> Select<i64x4<S>> for mask64x4<S> {
+    #[inline(always)]
+    fn select(self, if_true: i64x4<S>, if_false: i64x4<S>) -> i64x4<S> {
+        self.simd.select_i64x4(self, if_true, if_false)
+    }
+}
+impl<S: Simd> Bytes for i64x4<S> {
+    type Bytes = u8x32<S>;
+    #[inline(always)]
+    fn to_bytes(self) -> Self::Bytes {
+        self.simd.cvt_to_bytes_i64x4(self)
+    }
+    #[inline(always)]
+    fn from_bytes(value: Self::Bytes) -> Self {
+        value.simd.cvt_from_bytes_i64x4(value)
+    }
+}
+impl<S: Simd> SimdBase<S> for i64x4<S> {
+    type Element = i64;
+    const N: usize = 4;
+    type Mask = mask64x4<S>;
+    type Block = i64x2<S>;
+    type Array = [i64; 4];
+    #[inline(always)]
+    fn witness(&self) -> S {
+        self.simd
+    }
+    #[inline(always)]
+    fn as_slice(&self) -> &[i64] {
+        self.simd.as_array_ref_i64x4(self).as_slice()
+    }
+    #[inline(always)]
+    fn as_mut_slice(&mut self) -> &mut [i64] {
+        self.simd.as_array_mut_i64x4(self).as_mut_slice()
+    }
+    #[inline(always)]
+    fn from_slice(simd: S, slice: &[i64]) -> Self {
+        simd.load_array_ref_i64x4(slice.try_into().unwrap())
+    }
+    #[inline(always)]
+    fn store_slice(&self, slice: &mut [i64]) {
+        self.simd
+            .store_array_i64x4(*self, slice.try_into().unwrap());
+    }
+    #[inline(always)]
+    fn splat(simd: S, val: i64) -> Self {
+        simd.splat_i64x4(val)
+    }
+    #[inline(always)]
+    fn block_splat(block: Self::Block) -> Self {
+        block.simd.combine_i64x2(block, block)
+    }
+    #[inline(always)]
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i64) -> Self {
+        simd.load_array_i64x4([f(0usize), f(1usize), f(2usize), f(3usize)])
+    }
+    #[inline(always)]
+    fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_i64x4::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_within_blocks_i64x4::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdInt<S> for i64x4<S> {
+    #[inline(always)]
+    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_eq_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_lt_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_le(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_le_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_ge_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_gt_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_low_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_high_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_low_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_high_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn interleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.interleave_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn deinterleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.deinterleave_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn min(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.min_i64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn max(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.max_i64x4(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdSplit<S> for i64x4<S> {
+    type Split = i64x2<S>;
+    #[inline(always)]
+    fn split(self) -> (Self::Split, Self::Split) {
+        self.simd.split_i64x4(self)
+    }
+}
+impl<S: Simd> crate::SimdCombine<S> for i64x4<S> {
+    type Combined = i64x8<S>;
+    #[inline(always)]
+    fn combine(self, rhs: impl SimdInto<Self, S>) -> Self::Combined {
+        self.simd.combine_i64x4(self, rhs.simd_into(self.simd))
+    }
+}
+#[doc = "A SIMD vector of 4 [`u64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, u64x4};\nfn construct_simd<S: Simd>(simd: S) {\n    // From a single scalar value:\n    let a = u64x4::splat(simd, 1);\n    let b = u64x4::simd_from(simd, 1);\n\n    // From a slice:\n    let c = u64x4::from_slice(simd, &[1, 2, 3, 4]);\n\n    // From an array:\n    let d = u64x4::simd_from(simd, [1, 2, 3, 4]);\n\n    // From an element-wise function:\n    let e = u64x4::from_fn(simd, |i| i as u64);\n    # use fearless_simd::u64x2;\n    // From `Self::Block`:\n    let f = u64x4::block_splat(u64x2::simd_from(simd, [1, 2]));\n}\n```"]
+#[derive(Clone, Copy)]
+#[repr(C, align(32))]
+pub struct u64x4<S: Simd> {
+    pub(crate) val: S::u64x4,
+    pub simd: S,
+}
+impl<S: Simd> Seal for u64x4<S> {}
+impl<S: Simd> SimdFrom<[u64; 4], S> for u64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, val: [u64; 4]) -> Self {
+        simd.load_array_u64x4(val)
+    }
+}
+impl<S: Simd> From<u64x4<S>> for [u64; 4] {
+    #[inline(always)]
+    fn from(value: u64x4<S>) -> Self {
+        value.simd.as_array_u64x4(value)
+    }
+}
+impl<S: Simd> core::ops::Deref for u64x4<S> {
+    type Target = [u64; 4];
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.simd.as_array_ref_u64x4(self)
+    }
+}
+impl<S: Simd> core::ops::DerefMut for u64x4<S> {
+    #[inline(always)]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.simd.as_array_mut_u64x4(self)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for u64x4<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        crate::support::simd_debug_impl(f, "u64x4", &self.simd, self.simd.as_array_ref_u64x4(self))
+    }
+}
+impl<S: Simd> SimdFrom<u64, S> for u64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, value: u64) -> Self {
+        simd.splat_u64x4(value)
+    }
+}
+impl<S: Simd> core::ops::Index<usize> for u64x4<S> {
+    type Output = u64;
+    #[inline(always)]
+    fn index(&self, i: usize) -> &Self::Output {
+        &self.simd.as_array_ref_u64x4(self)[i]
+    }
+}
+impl<S: Simd> core::ops::IndexMut<usize> for u64x4<S> {
+    #[inline(always)]
+    fn index_mut(&mut self, i: usize) -> &mut Self::Output {
+        &mut self.simd.as_array_mut_u64x4(self)[i]
+    }
+}
+impl<S: Simd> Select<u64x4<S>> for mask64x4<S> {
+    #[inline(always)]
+    fn select(self, if_true: u64x4<S>, if_false: u64x4<S>) -> u64x4<S> {
+        self.simd.select_u64x4(self, if_true, if_false)
+    }
+}
+impl<S: Simd> Bytes for u64x4<S> {
+    type Bytes = u8x32<S>;
+    #[inline(always)]
+    fn to_bytes(self) -> Self::Bytes {
+        self.simd.cvt_to_bytes_u64x4(self)
+    }
+    #[inline(always)]
+    fn from_bytes(value: Self::Bytes) -> Self {
+        value.simd.cvt_from_bytes_u64x4(value)
+    }
+}
+impl<S: Simd> SimdBase<S> for u64x4<S> {
+    type Element = u64;
+    const N: usize = 4;
+    type Mask = mask64x4<S>;
+    type Block = u64x2<S>;
+    type Array = [u64; 4];
+    #[inline(always)]
+    fn witness(&self) -> S {
+        self.simd
+    }
+    #[inline(always)]
+    fn as_slice(&self) -> &[u64] {
+        self.simd.as_array_ref_u64x4(self).as_slice()
+    }
+    #[inline(always)]
+    fn as_mut_slice(&mut self) -> &mut [u64] {
+        self.simd.as_array_mut_u64x4(self).as_mut_slice()
+    }
+    #[inline(always)]
+    fn from_slice(simd: S, slice: &[u64]) -> Self {
+        simd.load_array_ref_u64x4(slice.try_into().unwrap())
+    }
+    #[inline(always)]
+    fn store_slice(&self, slice: &mut [u64]) {
+        self.simd
+            .store_array_u64x4(*self, slice.try_into().unwrap());
+    }
+    #[inline(always)]
+    fn splat(simd: S, val: u64) -> Self {
+        simd.splat_u64x4(val)
+    }
+    #[inline(always)]
+    fn block_splat(block: Self::Block) -> Self {
+        block.simd.combine_u64x2(block, block)
+    }
+    #[inline(always)]
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u64) -> Self {
+        simd.load_array_u64x4([f(0usize), f(1usize), f(2usize), f(3usize)])
+    }
+    #[inline(always)]
+    fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_u64x4::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_within_blocks_u64x4::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdInt<S> for u64x4<S> {
+    #[inline(always)]
+    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_eq_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_lt_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_le(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_le_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_ge_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_gt_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_low_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_high_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_low_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_high_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn interleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.interleave_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn deinterleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.deinterleave_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn min(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.min_u64x4(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn max(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.max_u64x4(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdSplit<S> for u64x4<S> {
+    type Split = u64x2<S>;
+    #[inline(always)]
+    fn split(self) -> (Self::Split, Self::Split) {
+        self.simd.split_u64x4(self)
+    }
+}
+impl<S: Simd> crate::SimdCombine<S> for u64x4<S> {
+    type Combined = u64x8<S>;
+    #[inline(always)]
+    fn combine(self, rhs: impl SimdInto<Self, S>) -> Self::Combined {
+        self.simd.combine_u64x4(self, rhs.simd_into(self.simd))
+    }
+}
+#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x4};\nfn construct_mask<S: Simd>(simd: S) {\n    // From a single boolean value:\n    let a = mask64x4::splat(simd, true);\n    let b = mask64x4::simd_from(simd, true);\n\n    // From signed integer mask lanes:\n    let c = mask64x4::from_slice(simd, &[-1, 0, 0, 0]);\n    let d = mask64x4::simd_from(simd, [-1, 0, 0, 0]);\n\n    // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n    let e = mask64x4::from_bitmask(simd, 0b0001);\n\n    // By setting individual lanes:\n    let mut f = mask64x4::splat(simd, false);\n    f.set(0, true);\n}\n```"]
+#[derive(Clone, Copy)]
+pub struct mask64x4<S: Simd> {
+    pub(crate) val: S::mask64x4,
+    pub(crate) simd: S,
+}
+impl<S: Simd> Seal for mask64x4<S> {}
+impl<S: Simd> SimdFrom<[i64; 4], S> for mask64x4<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, val: [i64; 4]) -> Self {
+        simd.load_array_mask64x4(val)
+    }
+}
+impl<S: Simd> From<mask64x4<S>> for [i64; 4] {
+    #[inline(always)]
+    fn from(value: mask64x4<S>) -> Self {
+        value.simd.as_array_mask64x4(value)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for mask64x4<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         let lanes = self.simd.as_array_mask64x4(*self);
         crate::support::simd_debug_impl(f, "mask64x4", &self.simd, &lanes)
     }
@@ -4171,8 +5104,25 @@ impl<S: Simd> SimdBase<S> for f32x16<S> {
         block2.simd.combine_f32x8(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self {
-        simd.load_array_f32x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> f32) -> Self {
+        simd.load_array_f32x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -4426,8 +5376,73 @@ impl<S: Simd> SimdBase<S> for i8x64<S> {
         block2.simd.combine_i8x32(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self {
-        simd.load_array_i8x64(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i8) -> Self {
+        simd.load_array_i8x64([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+            f(16usize),
+            f(17usize),
+            f(18usize),
+            f(19usize),
+            f(20usize),
+            f(21usize),
+            f(22usize),
+            f(23usize),
+            f(24usize),
+            f(25usize),
+            f(26usize),
+            f(27usize),
+            f(28usize),
+            f(29usize),
+            f(30usize),
+            f(31usize),
+            f(32usize),
+            f(33usize),
+            f(34usize),
+            f(35usize),
+            f(36usize),
+            f(37usize),
+            f(38usize),
+            f(39usize),
+            f(40usize),
+            f(41usize),
+            f(42usize),
+            f(43usize),
+            f(44usize),
+            f(45usize),
+            f(46usize),
+            f(47usize),
+            f(48usize),
+            f(49usize),
+            f(50usize),
+            f(51usize),
+            f(52usize),
+            f(53usize),
+            f(54usize),
+            f(55usize),
+            f(56usize),
+            f(57usize),
+            f(58usize),
+            f(59usize),
+            f(60usize),
+            f(61usize),
+            f(62usize),
+            f(63usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -4612,8 +5627,73 @@ impl<S: Simd> SimdBase<S> for u8x64<S> {
         block2.simd.combine_u8x32(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self {
-        simd.load_array_u8x64(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u8) -> Self {
+        simd.load_array_u8x64([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+            f(16usize),
+            f(17usize),
+            f(18usize),
+            f(19usize),
+            f(20usize),
+            f(21usize),
+            f(22usize),
+            f(23usize),
+            f(24usize),
+            f(25usize),
+            f(26usize),
+            f(27usize),
+            f(28usize),
+            f(29usize),
+            f(30usize),
+            f(31usize),
+            f(32usize),
+            f(33usize),
+            f(34usize),
+            f(35usize),
+            f(36usize),
+            f(37usize),
+            f(38usize),
+            f(39usize),
+            f(40usize),
+            f(41usize),
+            f(42usize),
+            f(43usize),
+            f(44usize),
+            f(45usize),
+            f(46usize),
+            f(47usize),
+            f(48usize),
+            f(49usize),
+            f(50usize),
+            f(51usize),
+            f(52usize),
+            f(53usize),
+            f(54usize),
+            f(55usize),
+            f(56usize),
+            f(57usize),
+            f(58usize),
+            f(59usize),
+            f(60usize),
+            f(61usize),
+            f(62usize),
+            f(63usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -4894,8 +5974,41 @@ impl<S: Simd> SimdBase<S> for i16x32<S> {
         block2.simd.combine_i16x16(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self {
-        simd.load_array_i16x32(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i16) -> Self {
+        simd.load_array_i16x32([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+            f(16usize),
+            f(17usize),
+            f(18usize),
+            f(19usize),
+            f(20usize),
+            f(21usize),
+            f(22usize),
+            f(23usize),
+            f(24usize),
+            f(25usize),
+            f(26usize),
+            f(27usize),
+            f(28usize),
+            f(29usize),
+            f(30usize),
+            f(31usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -5086,8 +6199,41 @@ impl<S: Simd> SimdBase<S> for u16x32<S> {
         block2.simd.combine_u16x16(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self {
-        simd.load_array_u16x32(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u16) -> Self {
+        simd.load_array_u16x32([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+            f(16usize),
+            f(17usize),
+            f(18usize),
+            f(19usize),
+            f(20usize),
+            f(21usize),
+            f(22usize),
+            f(23usize),
+            f(24usize),
+            f(25usize),
+            f(26usize),
+            f(27usize),
+            f(28usize),
+            f(29usize),
+            f(30usize),
+            f(31usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -5369,8 +6515,25 @@ impl<S: Simd> SimdBase<S> for i32x16<S> {
         block2.simd.combine_i32x8(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self {
-        simd.load_array_i32x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i32) -> Self {
+        simd.load_array_i32x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -5573,8 +6736,25 @@ impl<S: Simd> SimdBase<S> for u32x16<S> {
         block2.simd.combine_u32x8(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self {
-        simd.load_array_u32x16(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u32) -> Self {
+        simd.load_array_u32x16([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+            f(8usize),
+            f(9usize),
+            f(10usize),
+            f(11usize),
+            f(12usize),
+            f(13usize),
+            f(14usize),
+            f(15usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -5863,8 +7043,17 @@ impl<S: Simd> SimdBase<S> for f64x8<S> {
         block2.simd.combine_f64x4(block2, block2)
     }
     #[inline(always)]
-    fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self {
-        simd.load_array_f64x8(core::array::from_fn(f))
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> f64) -> Self {
+        simd.load_array_f64x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
     }
     #[inline(always)]
     fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
@@ -5992,6 +7181,396 @@ impl<S: Simd> crate::SimdSplit<S> for f64x8<S> {
         self.simd.split_f64x8(self)
     }
 }
+#[doc = "A SIMD vector of 8 [`i64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, i64x8};\nfn construct_simd<S: Simd>(simd: S) {\n    // From a single scalar value:\n    let a = i64x8::splat(simd, 1);\n    let b = i64x8::simd_from(simd, 1);\n\n    // From a slice:\n    let c = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);\n\n    // From an array:\n    let d = i64x8::simd_from(simd, [1, 2, 3, 4, 5, 6, 7, 8]);\n\n    // From an element-wise function:\n    let e = i64x8::from_fn(simd, |i| i as i64);\n    # use fearless_simd::i64x2;\n    // From `Self::Block`:\n    let f = i64x8::block_splat(i64x2::simd_from(simd, [1, 2]));\n}\n```"]
+#[derive(Clone, Copy)]
+#[repr(C, align(64))]
+pub struct i64x8<S: Simd> {
+    pub(crate) val: S::i64x8,
+    pub simd: S,
+}
+impl<S: Simd> Seal for i64x8<S> {}
+impl<S: Simd> SimdFrom<[i64; 8], S> for i64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, val: [i64; 8]) -> Self {
+        simd.load_array_i64x8(val)
+    }
+}
+impl<S: Simd> From<i64x8<S>> for [i64; 8] {
+    #[inline(always)]
+    fn from(value: i64x8<S>) -> Self {
+        value.simd.as_array_i64x8(value)
+    }
+}
+impl<S: Simd> core::ops::Deref for i64x8<S> {
+    type Target = [i64; 8];
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.simd.as_array_ref_i64x8(self)
+    }
+}
+impl<S: Simd> core::ops::DerefMut for i64x8<S> {
+    #[inline(always)]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.simd.as_array_mut_i64x8(self)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for i64x8<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        crate::support::simd_debug_impl(f, "i64x8", &self.simd, self.simd.as_array_ref_i64x8(self))
+    }
+}
+impl<S: Simd> SimdFrom<i64, S> for i64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, value: i64) -> Self {
+        simd.splat_i64x8(value)
+    }
+}
+impl<S: Simd> core::ops::Index<usize> for i64x8<S> {
+    type Output = i64;
+    #[inline(always)]
+    fn index(&self, i: usize) -> &Self::Output {
+        &self.simd.as_array_ref_i64x8(self)[i]
+    }
+}
+impl<S: Simd> core::ops::IndexMut<usize> for i64x8<S> {
+    #[inline(always)]
+    fn index_mut(&mut self, i: usize) -> &mut Self::Output {
+        &mut self.simd.as_array_mut_i64x8(self)[i]
+    }
+}
+impl<S: Simd> Select<i64x8<S>> for mask64x8<S> {
+    #[inline(always)]
+    fn select(self, if_true: i64x8<S>, if_false: i64x8<S>) -> i64x8<S> {
+        self.simd.select_i64x8(self, if_true, if_false)
+    }
+}
+impl<S: Simd> Bytes for i64x8<S> {
+    type Bytes = u8x64<S>;
+    #[inline(always)]
+    fn to_bytes(self) -> Self::Bytes {
+        self.simd.cvt_to_bytes_i64x8(self)
+    }
+    #[inline(always)]
+    fn from_bytes(value: Self::Bytes) -> Self {
+        value.simd.cvt_from_bytes_i64x8(value)
+    }
+}
+impl<S: Simd> SimdBase<S> for i64x8<S> {
+    type Element = i64;
+    const N: usize = 8;
+    type Mask = mask64x8<S>;
+    type Block = i64x2<S>;
+    type Array = [i64; 8];
+    #[inline(always)]
+    fn witness(&self) -> S {
+        self.simd
+    }
+    #[inline(always)]
+    fn as_slice(&self) -> &[i64] {
+        self.simd.as_array_ref_i64x8(self).as_slice()
+    }
+    #[inline(always)]
+    fn as_mut_slice(&mut self) -> &mut [i64] {
+        self.simd.as_array_mut_i64x8(self).as_mut_slice()
+    }
+    #[inline(always)]
+    fn from_slice(simd: S, slice: &[i64]) -> Self {
+        simd.load_array_ref_i64x8(slice.try_into().unwrap())
+    }
+    #[inline(always)]
+    fn store_slice(&self, slice: &mut [i64]) {
+        self.simd
+            .store_array_i64x8(*self, slice.try_into().unwrap());
+    }
+    #[inline(always)]
+    fn splat(simd: S, val: i64) -> Self {
+        simd.splat_i64x8(val)
+    }
+    #[inline(always)]
+    fn block_splat(block: Self::Block) -> Self {
+        let block2 = block.simd.combine_i64x2(block, block);
+        block2.simd.combine_i64x4(block2, block2)
+    }
+    #[inline(always)]
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> i64) -> Self {
+        simd.load_array_i64x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
+    }
+    #[inline(always)]
+    fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_i64x8::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_within_blocks_i64x8::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdInt<S> for i64x8<S> {
+    #[inline(always)]
+    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_eq_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_lt_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_le(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_le_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_ge_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_gt_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_low_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_high_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_low_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_high_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn interleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.interleave_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn deinterleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.deinterleave_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn min(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.min_i64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn max(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.max_i64x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdSplit<S> for i64x8<S> {
+    type Split = i64x4<S>;
+    #[inline(always)]
+    fn split(self) -> (Self::Split, Self::Split) {
+        self.simd.split_i64x8(self)
+    }
+}
+#[doc = "A SIMD vector of 8 [`u64`] elements.\n\nYou may construct this vector type using the [`Self::splat`], [`Self::from_slice`], [`Self::simd_from`], [`Self::from_fn`], and [`Self::block_splat`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, u64x8};\nfn construct_simd<S: Simd>(simd: S) {\n    // From a single scalar value:\n    let a = u64x8::splat(simd, 1);\n    let b = u64x8::simd_from(simd, 1);\n\n    // From a slice:\n    let c = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);\n\n    // From an array:\n    let d = u64x8::simd_from(simd, [1, 2, 3, 4, 5, 6, 7, 8]);\n\n    // From an element-wise function:\n    let e = u64x8::from_fn(simd, |i| i as u64);\n    # use fearless_simd::u64x2;\n    // From `Self::Block`:\n    let f = u64x8::block_splat(u64x2::simd_from(simd, [1, 2]));\n}\n```"]
+#[derive(Clone, Copy)]
+#[repr(C, align(64))]
+pub struct u64x8<S: Simd> {
+    pub(crate) val: S::u64x8,
+    pub simd: S,
+}
+impl<S: Simd> Seal for u64x8<S> {}
+impl<S: Simd> SimdFrom<[u64; 8], S> for u64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, val: [u64; 8]) -> Self {
+        simd.load_array_u64x8(val)
+    }
+}
+impl<S: Simd> From<u64x8<S>> for [u64; 8] {
+    #[inline(always)]
+    fn from(value: u64x8<S>) -> Self {
+        value.simd.as_array_u64x8(value)
+    }
+}
+impl<S: Simd> core::ops::Deref for u64x8<S> {
+    type Target = [u64; 8];
+    #[inline(always)]
+    fn deref(&self) -> &Self::Target {
+        self.simd.as_array_ref_u64x8(self)
+    }
+}
+impl<S: Simd> core::ops::DerefMut for u64x8<S> {
+    #[inline(always)]
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.simd.as_array_mut_u64x8(self)
+    }
+}
+impl<S: Simd + core::fmt::Debug> core::fmt::Debug for u64x8<S> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        crate::support::simd_debug_impl(f, "u64x8", &self.simd, self.simd.as_array_ref_u64x8(self))
+    }
+}
+impl<S: Simd> SimdFrom<u64, S> for u64x8<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, value: u64) -> Self {
+        simd.splat_u64x8(value)
+    }
+}
+impl<S: Simd> core::ops::Index<usize> for u64x8<S> {
+    type Output = u64;
+    #[inline(always)]
+    fn index(&self, i: usize) -> &Self::Output {
+        &self.simd.as_array_ref_u64x8(self)[i]
+    }
+}
+impl<S: Simd> core::ops::IndexMut<usize> for u64x8<S> {
+    #[inline(always)]
+    fn index_mut(&mut self, i: usize) -> &mut Self::Output {
+        &mut self.simd.as_array_mut_u64x8(self)[i]
+    }
+}
+impl<S: Simd> Select<u64x8<S>> for mask64x8<S> {
+    #[inline(always)]
+    fn select(self, if_true: u64x8<S>, if_false: u64x8<S>) -> u64x8<S> {
+        self.simd.select_u64x8(self, if_true, if_false)
+    }
+}
+impl<S: Simd> Bytes for u64x8<S> {
+    type Bytes = u8x64<S>;
+    #[inline(always)]
+    fn to_bytes(self) -> Self::Bytes {
+        self.simd.cvt_to_bytes_u64x8(self)
+    }
+    #[inline(always)]
+    fn from_bytes(value: Self::Bytes) -> Self {
+        value.simd.cvt_from_bytes_u64x8(value)
+    }
+}
+impl<S: Simd> SimdBase<S> for u64x8<S> {
+    type Element = u64;
+    const N: usize = 8;
+    type Mask = mask64x8<S>;
+    type Block = u64x2<S>;
+    type Array = [u64; 8];
+    #[inline(always)]
+    fn witness(&self) -> S {
+        self.simd
+    }
+    #[inline(always)]
+    fn as_slice(&self) -> &[u64] {
+        self.simd.as_array_ref_u64x8(self).as_slice()
+    }
+    #[inline(always)]
+    fn as_mut_slice(&mut self) -> &mut [u64] {
+        self.simd.as_array_mut_u64x8(self).as_mut_slice()
+    }
+    #[inline(always)]
+    fn from_slice(simd: S, slice: &[u64]) -> Self {
+        simd.load_array_ref_u64x8(slice.try_into().unwrap())
+    }
+    #[inline(always)]
+    fn store_slice(&self, slice: &mut [u64]) {
+        self.simd
+            .store_array_u64x8(*self, slice.try_into().unwrap());
+    }
+    #[inline(always)]
+    fn splat(simd: S, val: u64) -> Self {
+        simd.splat_u64x8(val)
+    }
+    #[inline(always)]
+    fn block_splat(block: Self::Block) -> Self {
+        let block2 = block.simd.combine_u64x2(block, block);
+        block2.simd.combine_u64x4(block2, block2)
+    }
+    #[inline(always)]
+    fn from_fn(simd: S, mut f: impl FnMut(usize) -> u64) -> Self {
+        simd.load_array_u64x8([
+            f(0usize),
+            f(1usize),
+            f(2usize),
+            f(3usize),
+            f(4usize),
+            f(5usize),
+            f(6usize),
+            f(7usize),
+        ])
+    }
+    #[inline(always)]
+    fn slide<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_u64x8::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn slide_within_blocks<const SHIFT: usize>(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd
+            .slide_within_blocks_u64x8::<SHIFT>(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdInt<S> for u64x8<S> {
+    #[inline(always)]
+    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_eq_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_lt_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_le(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_le_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_ge_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> Self::Mask {
+        self.simd.simd_gt_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_low_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn zip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.zip_high_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_low(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_low_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.unzip_high_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn interleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.interleave_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn deinterleave(self, rhs: impl SimdInto<Self, S>) -> (Self, Self) {
+        self.simd.deinterleave_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn min(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.min_u64x8(self, rhs.simd_into(self.simd))
+    }
+    #[inline(always)]
+    fn max(self, rhs: impl SimdInto<Self, S>) -> Self {
+        self.simd.max_u64x8(self, rhs.simd_into(self.simd))
+    }
+}
+impl<S: Simd> crate::SimdSplit<S> for u64x8<S> {
+    type Split = u64x4<S>;
+    #[inline(always)]
+    fn split(self) -> (Self::Split, Self::Split) {
+        self.simd.split_u64x8(self)
+    }
+}
 #[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque and may vary depending on the SIMD level.\n\nYou can construct this mask type using the [`Self::splat`], [`Self::from_bitmask`], [`Self::from_slice`], and [`Self::simd_from`] methods.\n\n```rust\n# use fearless_simd::{prelude::*, mask64x8};\nfn construct_mask<S: Simd>(simd: S) {\n    // From a single boolean value:\n    let a = mask64x8::splat(simd, true);\n    let b = mask64x8::simd_from(simd, true);\n\n    // From signed integer mask lanes:\n    let c = mask64x8::from_slice(simd, &[-1, 0, 0, 0, 0, 0, 0, 0]);\n    let d = mask64x8::simd_from(simd, [-1, 0, 0, 0, 0, 0, 0, 0]);\n\n    // From a compact bitmask (same mask as above, least significant bit maps to lane 0):\n    let e = mask64x8::from_bitmask(simd, 0b0001);\n\n    // By setting individual lanes:\n    let mut f = mask64x8::splat(simd, false);\n    f.set(0, true);\n}\n```"]
 #[derive(Clone, Copy)]
 pub struct mask64x8<S: Simd> {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index abbac0c52..264c6990b 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -6,9 +6,9 @@
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
@@ -44,6 +44,8 @@ impl ArchTypes for Sse4_2 {
     type u32x4 = crate::support::Aligned128<__m128i>;
     type mask32x4 = crate::support::Aligned128<__m128i>;
     type f64x2 = crate::support::Aligned128<__m128d>;
+    type i64x2 = crate::support::Aligned128<__m128i>;
+    type u64x2 = crate::support::Aligned128<__m128i>;
     type mask64x2 = crate::support::Aligned128<__m128i>;
     type f32x8 = crate::support::Aligned256<[__m128; 2usize]>;
     type i8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
@@ -56,6 +58,8 @@ impl ArchTypes for Sse4_2 {
     type u32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
     type mask32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
     type f64x4 = crate::support::Aligned256<[__m128d; 2usize]>;
+    type i64x4 = crate::support::Aligned256<[__m128i; 2usize]>;
+    type u64x4 = crate::support::Aligned256<[__m128i; 2usize]>;
     type mask64x4 = crate::support::Aligned256<[__m128i; 2usize]>;
     type f32x16 = crate::support::Aligned512<[__m128; 4usize]>;
     type i8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
@@ -68,6 +72,8 @@ impl ArchTypes for Sse4_2 {
     type u32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
     type mask32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
     type f64x8 = crate::support::Aligned512<[__m128d; 4usize]>;
+    type i64x8 = crate::support::Aligned512<[__m128i; 4usize]>;
+    type u64x8 = crate::support::Aligned512<[__m128i; 4usize]>;
     type mask64x8 = crate::support::Aligned512<[__m128i; 4usize]>;
 }
 impl Simd for Sse4_2 {
@@ -79,6 +85,8 @@ impl Simd for Sse4_2 {
     type i16s = i16x8<Self>;
     type u32s = u32x4<Self>;
     type i32s = i32x4<Self>;
+    type u64s = u64x2<Self>;
+    type i64s = i64x2<Self>;
     type mask8s = mask8x16<Self>;
     type mask16s = mask16x8<Self>;
     type mask32s = mask32x4<Self>;
@@ -796,7 +804,27 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i8; 16usize] = a.into();
+        let b: [i8; 16usize] = b.into();
+        let result: [i8; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
@@ -816,7 +844,27 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i8; 16usize] = a.into();
+        let b: [i8; 16usize] = b.into();
+        let result: [i8; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
@@ -1161,7 +1209,27 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u8; 16usize] = a.into();
+        let b: [u8; 16usize] = b.into();
+        let result: [u8; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
@@ -1181,7 +1249,27 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u8; 16usize] = a.into();
+        let b: [u8; 16usize] = b.into();
+        let result: [u8; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
@@ -1686,7 +1774,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i16; 8usize] = a.into();
+        let b: [i16; 8usize] = b.into();
+        let result: [i16; 8usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
@@ -1700,7 +1800,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i16; 8usize] = a.into();
+        let b: [i16; 8usize] = b.into();
+        let result: [i16; 8usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
@@ -2032,7 +2144,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u16; 8usize] = a.into();
+        let b: [u16; 8usize] = b.into();
+        let result: [u16; 8usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
@@ -2046,7 +2170,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u16; 8usize] = a.into();
+        let b: [u16; 8usize] = b.into();
+        let result: [u16; 8usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
@@ -2546,7 +2682,15 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i32; 4usize] = a.into();
+        let b: [i32; 4usize] = b.into();
+        let result: [i32; 4usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
@@ -2560,7 +2704,15 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i32; 4usize] = a.into();
+        let b: [i32; 4usize] = b.into();
+        let result: [i32; 4usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
@@ -2900,7 +3052,15 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        // Lane-wise scalar fallback: unpack both vectors into plain integers,
+        // shift each value of `a` left by the count held in the matching lane
+        // of `b`, then pack the four results back into a vector. The `<<`
+        // operator on integers is the `Shl` implementation itself, so counts
+        // outside the lane width are treated exactly as `Shl::shl` treats them.
+        let [x0, x1, x2, x3]: [u32; 4usize] = a.into();
+        let [n0, n1, n2, n3]: [u32; 4usize] = b.into();
+        let shifted: [u32; 4usize] = [x0 << n0, x1 << n1, x2 << n2, x3 << n3];
+        shifted.simd_into(self)
     }
     #[inline(always)]
     fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
@@ -2914,7 +3074,15 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        // Lane-wise scalar fallback: unpack both vectors into plain integers,
+        // shift each value of `a` right by the count held in the matching lane
+        // of `b` (zero-filling, since the lanes are `u32`), then repack. The
+        // `>>` operator on integers is the `Shr` implementation itself, so
+        // counts outside the lane width are treated as `Shr::shr` treats them.
+        let [x0, x1, x2, x3]: [u32; 4usize] = a.into();
+        let [n0, n1, n2, n3]: [u32; 4usize] = b.into();
+        let shifted: [u32; 4usize] = [x0 >> n0, x1 >> n1, x2 >> n2, x3 >> n3];
+        shifted.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
@@ -3659,4082 +3827,4794 @@ impl Simd for Sse4_2 {
         kernel(self, a)
     }
     #[inline(always)]
-    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+    fn splat_i64x2(self, val: i64) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, val: bool) -> mask64x2<Sse4_2> {
-                let val: i64 = if val { !0 } else { 0 };
+            fn kernel(token: Sse4_2, val: i64) -> i64x2<Sse4_2> {
                 _mm_set1_epi64x(val).simd_into(token)
             }
         );
         kernel(self, val)
     }
     #[inline(always)]
-    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
-        mask64x2 {
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize] {
         crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128i, [i64; 2usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128i, [i64; 2usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self> {
+        // Copies the storage of the byte vector through the crate's checked
+        // transmute helper (presumably a size-checked bit copy) into an `i64x2`.
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        i64x2 { val, simd: self }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        // Copies the storage of the `i64x2` through the crate's checked
+        // transmute helper (presumably a size-checked bit copy) into a `u8x16`.
+        let val = crate::transmute::checked_transmute_copy(&a.val);
+        u8x16 { val, simd: self }
+    }
+    #[inline(always)]
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        // A shift that spans the whole vector (two lanes) or more returns `b`
+        // unchanged without touching the byte-level helper.
+        if SHIFT >= 2usize {
+            return b;
+        }
+        // Otherwise work on the byte views. The helper receives `b` before `a`,
+        // and the shift is converted to bytes: eight per 64-bit lane.
+        let b_bytes = self.cvt_to_bytes_i64x2(b).val.0;
+        let a_bytes = self.cvt_to_bytes_i64x2(a).val.0;
+        let shifted = dyn_alignr_128(self, b_bytes, a_bytes, SHIFT * 8usize);
+        // Wrap the raw result back into a byte vector and view it as `i64x2`.
+        let val = crate::support::Aligned128(shifted);
+        self.cvt_from_bytes_i64x2(u8x16 { val, simd: self })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self> {
+        self.slide_i64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, bits: u64) -> mask64x2<Sse4_2> {
-                {
-                    let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
-                    let bit_mask = _mm_set_epi64x(2, 1);
-                    _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
-                }
-                .simd_into(token)
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_add_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, bits)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> u64 {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_sub_epi64(a.into(), b.into()).simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 2usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2usize
-        );
-        let mut lanes = self.as_array_mask64x2(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x2(lanes);
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        // Scalar fallback: multiply lane by lane on plain integers. The
+        // wrapping multiply keeps only the low 64 bits of each product, so an
+        // overflowing lane wraps around instead of panicking.
+        let [a0, a1]: [i64; 2usize] = a.into();
+        let [b0, b1]: [i64; 2usize] = b.into();
+        let product: [i64; 2usize] = [a0.wrapping_mul(b0), a1.wrapping_mul(b1)];
+        product.simd_into(self)
     }
     #[inline(always)]
-    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
                 _mm_and_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
                 _mm_or_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
                 _mm_xor_si128(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
-        self.xor_mask64x2(a, self.splat_mask64x2(true))
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
+        a ^ !0
     }
     #[inline(always)]
-    fn select_mask64x2(
-        self,
-        a: mask64x2<Self>,
-        b: mask64x2<Self>,
-        c: mask64x2<Self>,
-    ) -> mask64x2<Self> {
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(
-                token: Sse4_2,
-                a: mask64x2<Sse4_2>,
-                b: mask64x2<Sse4_2>,
-                c: mask64x2<Sse4_2>,
-            ) -> mask64x2<Sse4_2> {
-                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, shift: u32) -> i64x2<Sse4_2> {
+                _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
             }
         );
-        kernel(self, a, b, c)
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        // Lane-wise scalar fallback: each lane of `a` is shifted left by the
+        // count in the matching lane of `b`. `<<` on integers is the `Shl`
+        // implementation itself, so out-of-range counts behave as `Shl::shl`.
+        let [x0, x1]: [i64; 2usize] = a.into();
+        let [n0, n1]: [i64; 2usize] = b.into();
+        let shifted: [i64; 2usize] = [x0 << n0, x1 << n1];
+        shifted.simd_into(self)
+    }
+    #[inline(always)]
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
+        // Scalar arithmetic shift. A bare `>>` panics in debug builds and wraps the
+        // count in release builds once `shift >= 64`, while `shl_i64x2` hands any
+        // count to `_mm_sll_epi64`; clamping to 63 gives a defined sign fill instead.
+        let shift = shift.min(63);
+        let [a0, a1]: [i64; 2usize] = a.into();
+        [a0 >> shift, a1 >> shift].simd_into(self)
+    }
+    #[inline(always)]
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        // Lane-wise scalar fallback: each lane of `a` is shifted right by the
+        // count in the matching lane of `b`, propagating the sign bit. `>>` on
+        // integers is `Shr` itself, so out-of-range counts behave as `Shr::shr`.
+        let [x0, x1]: [i64; 2usize] = a.into();
+        let [n0, n1]: [i64; 2usize] = b.into();
+        let shifted: [i64; 2usize] = [x0 >> n0, x1 >> n1];
+        shifted.simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> mask64x2<Sse4_2> {
                 _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
             }
         );
         kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                // Scalar comparison: both operands are unpacked into plain integers
+                // and compared lane by lane. The mask is then assembled as an `i64`
+                // array in which a lane is all ones (`!0`) where the comparison
+                // holds and zero where it does not.
+                let [a0, a1]: [i64; 2usize] = a.into();
+                let [b0, b1]: [i64; 2usize] = b.into();
+                // Encodes one comparison outcome as a mask lane: `true` becomes
+                // `!0`, `false` becomes `0`.
+                let lane = |holds: bool| -> i64 {
+                    match holds {
+                        true => !0,
+                        false => 0,
+                    }
+                };
+                // Signed `<` on each pair of lanes.
+                let verdicts: [i64; 2usize] = [lane(a0 < b0), lane(a1 < b1)];
+                verdicts.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                // Scalar comparison: both operands are unpacked into plain integers
+                // and compared lane by lane. The mask is then assembled as an `i64`
+                // array in which a lane is all ones (`!0`) where the comparison
+                // holds and zero where it does not.
+                let [a0, a1]: [i64; 2usize] = a.into();
+                let [b0, b1]: [i64; 2usize] = b.into();
+                // Encodes one comparison outcome as a mask lane: `true` becomes
+                // `!0`, `false` becomes `0`.
+                let lane = |holds: bool| -> i64 {
+                    match holds {
+                        true => !0,
+                        false => 0,
+                    }
+                };
+                // Signed `<=` on each pair of lanes.
+                let verdicts: [i64; 2usize] = [lane(a0 <= b0), lane(a1 <= b1)];
+                verdicts.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                // Scalar comparison: both operands are unpacked into plain integers
+                // and compared lane by lane. The mask is then assembled as an `i64`
+                // array in which a lane is all ones (`!0`) where the comparison
+                // holds and zero where it does not.
+                let [a0, a1]: [i64; 2usize] = a.into();
+                let [b0, b1]: [i64; 2usize] = b.into();
+                // Encodes one comparison outcome as a mask lane: `true` becomes
+                // `!0`, `false` becomes `0`.
+                let lane = |holds: bool| -> i64 {
+                    match holds {
+                        true => !0,
+                        false => 0,
+                    }
+                };
+                // Signed `>=` on each pair of lanes.
+                let verdicts: [i64; 2usize] = [lane(a0 >= b0), lane(a1 >= b1)];
+                verdicts.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
-                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                // Scalar comparison: both operands are unpacked into plain integers
+                // and compared lane by lane. The mask is then assembled as an `i64`
+                // array in which a lane is all ones (`!0`) where the comparison
+                // holds and zero where it does not.
+                let [a0, a1]: [i64; 2usize] = a.into();
+                let [b0, b1]: [i64; 2usize] = b.into();
+                // Encodes one comparison outcome as a mask lane: `true` becomes
+                // `!0`, `false` becomes `0`.
+                let lane = |holds: bool| -> i64 {
+                    match holds {
+                        true => !0,
+                        false => 0,
+                    }
+                };
+                // Signed `>` on each pair of lanes.
+                let verdicts: [i64; 2usize] = [lane(a0 > b0), lane(a1 > b1)];
+                verdicts.simd_into(token)
             }
         );
-        kernel(self, a)
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
-        mask64x4 {
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>, b: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
+    }
+    #[inline(always)]
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Sse4_2,
+                a: mask64x2<Sse4_2>,
+                b: i64x2<Sse4_2>,
+                c: i64x2<Sse4_2>,
+            ) -> i64x2<Sse4_2> {
+                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
+    }
+    #[inline(always)]
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        // Scalar fallback: take the signed minimum of each pair of lanes.
+        let [a0, a1]: [i64; 2usize] = a.into();
+        let [b0, b1]: [i64; 2usize] = b.into();
+        [a0.min(b0), a1.min(b1)].simd_into(self)
+    }
+    #[inline(always)]
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        // Scalar fallback: take the signed maximum of each pair of lanes.
+        let [a0, a1]: [i64; 2usize] = a.into();
+        let [b0, b1]: [i64; 2usize] = b.into();
+        [a0.max(b0), a1.max(b1)].simd_into(self)
+    }
+    #[inline(always)]
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self> {
+        i64x4 {
             val: crate::support::Aligned256([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
-        let half = self.splat_f32x4(val);
-        self.combine_f32x4(half, half)
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>) -> i64x2<Sse4_2> {
+                _mm_sub_epi64(_mm_setzero_si128(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>) -> u8x16<Sse4_2> {
+                __m128i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: i64x2<Sse4_2>) -> u32x4<Sse4_2> {
+                __m128i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn splat_u64x2(self, val: u64) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, val: u64) -> u64x2<Sse4_2> {
+                _mm_set1_epi64x(val.cast_signed()).simd_into(token)
+            }
+        );
+        kernel(self, val)
+    }
+    #[inline(always)]
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[__m128; 2usize], [f32; 8usize]>(&a.val.0)
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128i, [u64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
-        crate::transmute::checked_cast_ref::<[__m128; 2usize], [f32; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize] {
+        crate::transmute::checked_cast_ref::<__m128i, [u64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
-        crate::transmute::checked_cast_mut::<[__m128; 2usize], [f32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize] {
+        crate::transmute::checked_cast_mut::<__m128i, [u64; 2usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
-        f32x8 {
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        u8x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        if SHIFT >= 8usize {
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        if SHIFT >= 2usize {
             return b;
         }
-        let result = cross_block_alignr_128x2(
+        let result = dyn_alignr_128(
             self,
-            self.cvt_to_bytes_f32x8(b).val.0,
-            self.cvt_to_bytes_f32x8(a).val.0,
-            SHIFT * 4usize,
+            self.cvt_to_bytes_u64x2(b).val.0,
+            self.cvt_to_bytes_u64x2(a).val.0,
+            SHIFT * 8usize,
         );
-        self.cvt_from_bytes_f32x8(u8x32 {
-            val: crate::support::Aligned256(result),
+        self.cvt_from_bytes_u64x2(u8x16 {
+            val: crate::support::Aligned128(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
         self,
-        a: f32x8<Self>,
-        b: f32x8<Self>,
-    ) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self> {
+        self.slide_u64x2::<SHIFT>(a, b)
     }
     #[inline(always)]
-    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_add_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_sub_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(
-            self.approximate_recip_f32x4(a0),
-            self.approximate_recip_f32x4(a1),
-        )
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        // Scalar fallback: multiply lane by lane on plain integers. The
+        // wrapping multiply keeps only the low 64 bits of each product, so an
+        // overflowing lane wraps around instead of panicking.
+        let [a0, a1]: [u64; 2usize] = a.into();
+        let [b0, b1]: [u64; 2usize] = b.into();
+        let product: [u64; 2usize] = [a0.wrapping_mul(b0), a1.wrapping_mul(b1)];
+        product.simd_into(self)
     }
     #[inline(always)]
-    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_and_si128(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_or_si128(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_xor_si128(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self> {
+        a ^ !0
     }
     #[inline(always)]
-    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, shift: u32) -> u64x2<Sse4_2> {
+                _mm_sll_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        // Lane-wise scalar fallback: each lane of `a` is shifted left by the
+        // count in the matching lane of `b`. `<<` on integers is the `Shl`
+        // implementation itself, so out-of-range counts behave as `Shl::shl`.
+        let [x0, x1]: [u64; 2usize] = a.into();
+        let [n0, n1]: [u64; 2usize] = b.into();
+        let shifted: [u64; 2usize] = [x0 << n0, x1 << n1];
+        shifted.simd_into(self)
     }
     #[inline(always)]
-    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, shift: u32) -> u64x2<Sse4_2> {
+                _mm_srl_epi64(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(token)
+            }
+        );
+        kernel(self, a, shift)
     }
     #[inline(always)]
-    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        // Lane-wise scalar fallback: each lane of `a` is shifted right by the
+        // count in the matching lane of `b`, filling with zeros. `>>` on
+        // integers is `Shr` itself, so out-of-range counts behave as `Shr::shr`.
+        let [x0, x1]: [u64; 2usize] = a.into();
+        let [n0, n1]: [u64; 2usize] = b.into();
+        let shifted: [u64; 2usize] = [x0 >> n0, x1 >> n1];
+        shifted.simd_into(self)
     }
     #[inline(always)]
-    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] < b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] < b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, _) = self.split_f32x8(a);
-        let (b0, _) = self.split_f32x8(b);
-        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] <= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] <= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (_, a1) = self.split_f32x8(a);
-        let (_, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] >= b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] >= b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                let a: [u64; 2usize] = a.into();
+                let b: [u64; 2usize] = b.into();
+                let true_lane: i64 = !0;
+                let false_lane: i64 = 0;
+                let result: [i64; 2usize] = [
+                    if a[0usize] > b[0usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                    if a[1usize] > b[1usize] {
+                        true_lane
+                    } else {
+                        false_lane
+                    },
+                ];
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let lo_lo = self.zip_low_f32x4(a0, b0);
-        let lo_hi = self.zip_high_f32x4(a0, b0);
-        let hi_lo = self.zip_low_f32x4(a1, b1);
-        let hi_hi = self.zip_high_f32x4(a1, b1);
-        (
-            self.combine_f32x4(lo_lo, lo_hi),
-            self.combine_f32x4(hi_lo, hi_hi),
-        )
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let lo_even = self.unzip_low_f32x4(a0, a1);
-        let lo_odd = self.unzip_high_f32x4(a0, a1);
-        let hi_even = self.unzip_low_f32x4(b0, b1);
-        let hi_odd = self.unzip_high_f32x4(b0, b1);
-        (
-            self.combine_f32x4(lo_even, hi_even),
-            self.combine_f32x4(lo_odd, hi_odd),
-        )
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_unpacklo_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>, b: u64x2<Sse4_2>) -> u64x2<Sse4_2> {
+                _mm_unpackhi_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b))
     }
     #[inline(always)]
-    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.max_precise_f32x4(a0, b0),
-            self.max_precise_f32x4(a1, b1),
-        )
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b))
     }
     #[inline(always)]
-    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.min_precise_f32x4(a0, b0),
-            self.min_precise_f32x4(a1, b1),
-        )
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Sse4_2,
+                a: mask64x2<Sse4_2>,
+                b: u64x2<Sse4_2>,
+                c: u64x2<Sse4_2>,
+            ) -> u64x2<Sse4_2> {
+                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(
-            self.mul_add_f32x4(a0, b0, c0),
-            self.mul_add_f32x4(a1, b1, c1),
-        )
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(
-            self.mul_sub_f32x4(a0, b0, c0),
-            self.mul_sub_f32x4(a1, b1, c1),
-        )
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self> {
+        u64x4 {
+            val: crate::support::Aligned256([a.val.0, b.val.0]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>) -> u8x16<Sse4_2> {
+                __m128i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(
-            self.round_ties_even_f32x4(a0),
-            self.round_ties_even_f32x4(a1),
-        )
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x2<Sse4_2>) -> u32x4<Sse4_2> {
+                __m128i::from(a).simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, val: bool) -> mask64x2<Sse4_2> {
+                let val: i64 = if val { !0 } else { 0 };
+                _mm_set1_epi64x(val).simd_into(token)
+            }
+        );
+        kernel(self, val)
     }
     #[inline(always)]
-    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        mask64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        crate::transmute::checked_transmute_copy::<__m128i, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
-        f32x16 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, bits: u64) -> mask64x2<Sse4_2> {
+                {
+                    let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
+                    let bit_mask = _mm_set_epi64x(2, 1);
+                    _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+                }
+                .simd_into(token)
+            }
+        );
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
-        (
-            f32x4 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            f32x4 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> u64 {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f64x2(
-            self.reinterpret_f64_f32x4(a0),
-            self.reinterpret_f64_f32x4(a1),
-        )
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(
-            self.reinterpret_i32_f32x4(a0),
-            self.reinterpret_i32_f32x4(a1),
-        )
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                _mm_and_si128(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                _mm_or_si128(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_f32x4(a0),
-            self.reinterpret_u32_f32x4(a1),
-        )
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                _mm_xor_si128(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        self.xor_mask64x2(a, self.splat_mask64x2(true))
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(
-            self.cvt_u32_precise_f32x4(a0),
-            self.cvt_u32_precise_f32x4(a1),
-        )
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(
+                token: Sse4_2,
+                a: mask64x2<Sse4_2>,
+                b: mask64x2<Sse4_2>,
+                c: mask64x2<Sse4_2>,
+            ) -> mask64x2<Sse4_2> {
+                _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b, c)
     }
     #[inline(always)]
-    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>, b: mask64x2<Sse4_2>) -> mask64x2<Sse4_2> {
+                _mm_cmpeq_epi64(a.into(), b.into()).simd_into(token)
+            }
+        );
+        kernel(self, a, b)
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(
-            self.cvt_i32_precise_f32x4(a0),
-            self.cvt_i32_precise_f32x4(a1),
-        )
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
-        let half = self.splat_i8x16(val);
-        self.combine_i8x16(half, half)
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask64x2<Sse4_2>) -> bool {
+                _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0
+            }
+        );
+        kernel(self, a)
+    }
+    #[inline(always)]
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::support::Aligned256([a.val.0, b.val.0]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+        let half = self.splat_f32x4(val);
+        self.combine_f32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0)
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128; 2usize], [f32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0)
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m128; 2usize], [f32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m128; 2usize], [f32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
-        i8x32 {
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_128x2(
             self,
-            self.cvt_to_bytes_i8x32(b).val.0,
-            self.cvt_to_bytes_i8x32(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_i8x32(u8x32 {
+        self.cvt_from_bytes_f32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
         self,
-        a: i8x32<Self>,
-        b: i8x32<Self>,
-    ) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(
-            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
     }
     #[inline(always)]
-    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
     }
     #[inline(always)]
-    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
     }
     #[inline(always)]
-    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(
+            self.approximate_recip_f32x4(a0),
+            self.approximate_recip_f32x4(a1),
+        )
     }
     #[inline(always)]
-    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, _) = self.split_i8x32(a);
-        let (b0, _) = self.split_i8x32(b);
-        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, _) = self.split_f32x8(a);
+        let (b0, _) = self.split_f32x8(b);
+        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (_, a1) = self.split_i8x32(a);
-        let (_, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (_, a1) = self.split_f32x8(a);
+        let (_, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let lo_lo = self.zip_low_i8x16(a0, b0);
-        let lo_hi = self.zip_high_i8x16(a0, b0);
-        let hi_lo = self.zip_low_i8x16(a1, b1);
-        let hi_hi = self.zip_high_i8x16(a1, b1);
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let lo_lo = self.zip_low_f32x4(a0, b0);
+        let lo_hi = self.zip_high_f32x4(a0, b0);
+        let hi_lo = self.zip_low_f32x4(a1, b1);
+        let hi_hi = self.zip_high_f32x4(a1, b1);
         (
-            self.combine_i8x16(lo_lo, lo_hi),
-            self.combine_i8x16(hi_lo, hi_hi),
+            self.combine_f32x4(lo_lo, lo_hi),
+            self.combine_f32x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let lo_even = self.unzip_low_i8x16(a0, a1);
-        let lo_odd = self.unzip_high_i8x16(a0, a1);
-        let hi_even = self.unzip_low_i8x16(b0, b1);
-        let hi_odd = self.unzip_high_i8x16(b0, b1);
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let lo_even = self.unzip_low_f32x4(a0, a1);
+        let lo_odd = self.unzip_high_f32x4(a0, a1);
+        let hi_even = self.unzip_low_f32x4(b0, b1);
+        let hi_odd = self.unzip_high_f32x4(b0, b1);
         (
-            self.combine_i8x16(lo_even, hi_even),
-            self.combine_i8x16(lo_odd, hi_odd),
+            self.combine_f32x4(lo_even, hi_even),
+            self.combine_f32x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let (c0, c1) = self.split_i8x32(c);
-        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.max_precise_f32x4(a0, b0),
+            self.max_precise_f32x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
-        i8x64 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.min_precise_f32x4(a0, b0),
+            self.min_precise_f32x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
-        (
-            i8x16 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            i8x16 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let (c0, c1) = self.split_f32x8(c);
+        self.combine_f32x4(
+            self.mul_add_f32x4(a0, b0, c0),
+            self.mul_add_f32x4(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
-    }
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let (c0, c1) = self.split_f32x8(c);
+        self.combine_f32x4(
+            self.mul_sub_f32x4(a0, b0, c0),
+            self.mul_sub_f32x4(a1, b1, c1),
+        )
+    }
     #[inline(always)]
-    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i8x32(a);
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(
+            self.round_ties_even_f32x4(a0),
+            self.round_ties_even_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
+    }
+    #[inline(always)]
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
+    }
+    #[inline(always)]
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let (c0, c1) = self.split_f32x8(c);
+        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+        f32x16 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
+        (
+            f32x4 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            f32x4 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f64x2(
+            self.reinterpret_f64_f32x4(a0),
+            self.reinterpret_f64_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_i32x4(
+            self.reinterpret_i32_f32x4(a0),
+            self.reinterpret_i32_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
         self.combine_u32x4(
-            self.reinterpret_u32_i8x16(a0),
-            self.reinterpret_u32_i8x16(a1),
+            self.reinterpret_u32_f32x4(a0),
+            self.reinterpret_u32_f32x4(a1),
         )
     }
     #[inline(always)]
-    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
-        let half = self.splat_u8x16(val);
-        self.combine_u8x16(half, half)
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
     }
     #[inline(always)]
-    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_u32x4(
+            self.cvt_u32_precise_f32x4(a0),
+            self.cvt_u32_precise_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_i32x4(
+            self.cvt_i32_precise_f32x4(a0),
+            self.cvt_i32_precise_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+        let half = self.splat_i8x16(val);
+        self.combine_i8x16(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0)
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0)
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_alignr_128x2(
             self,
-            self.cvt_to_bytes_u8x32(b).val.0,
-            self.cvt_to_bytes_u8x32(a).val.0,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
             SHIFT,
         );
-        self.cvt_from_bytes_u8x32(u8x32 {
+        self.cvt_from_bytes_i8x32(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
         self,
-        a: u8x32<Self>,
-        b: u8x32<Self>,
-    ) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(
-            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(
+            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
     }
     #[inline(always)]
-    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, _) = self.split_u8x32(a);
-        let (b0, _) = self.split_u8x32(b);
-        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, _) = self.split_i8x32(a);
+        let (b0, _) = self.split_i8x32(b);
+        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (_, a1) = self.split_u8x32(a);
-        let (_, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (_, a1) = self.split_i8x32(a);
+        let (_, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let lo_lo = self.zip_low_u8x16(a0, b0);
-        let lo_hi = self.zip_high_u8x16(a0, b0);
-        let hi_lo = self.zip_low_u8x16(a1, b1);
-        let hi_hi = self.zip_high_u8x16(a1, b1);
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        let lo_lo = self.zip_low_i8x16(a0, b0);
+        let lo_hi = self.zip_high_i8x16(a0, b0);
+        let hi_lo = self.zip_low_i8x16(a1, b1);
+        let hi_hi = self.zip_high_i8x16(a1, b1);
         (
-            self.combine_u8x16(lo_lo, lo_hi),
-            self.combine_u8x16(hi_lo, hi_hi),
+            self.combine_i8x16(lo_lo, lo_hi),
+            self.combine_i8x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let lo_even = self.unzip_low_u8x16(a0, a1);
-        let lo_odd = self.unzip_high_u8x16(a0, a1);
-        let hi_even = self.unzip_low_u8x16(b0, b1);
-        let hi_odd = self.unzip_high_u8x16(b0, b1);
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        let lo_even = self.unzip_low_i8x16(a0, a1);
+        let lo_odd = self.unzip_high_i8x16(a0, a1);
+        let hi_even = self.unzip_low_i8x16(b0, b1);
+        let hi_odd = self.unzip_high_i8x16(b0, b1);
         (
-            self.combine_u8x16(lo_even, hi_even),
-            self.combine_u8x16(lo_odd, hi_odd),
+            self.combine_i8x16(lo_even, hi_even),
+            self.combine_i8x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let (c0, c1) = self.split_u8x32(c);
-        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
+        let (b0, b1) = self.split_i8x32(b);
+        let (c0, c1) = self.split_i8x32(c);
+        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        i8x64 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
         (
-            u8x16 {
+            i8x16 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            u8x16 {
+            i8x16 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u8x32(a);
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i8x32(a);
         self.combine_u32x4(
-            self.reinterpret_u32_u8x16(a0),
-            self.reinterpret_u32_u8x16(a1),
+            self.reinterpret_u32_i8x16(a0),
+            self.reinterpret_u32_i8x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
-        let half = self.splat_mask8x16(val);
-        self.combine_mask8x16(half, half)
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        let half = self.splat_u8x16(val);
+        self.combine_u8x16(half, half)
     }
     #[inline(always)]
-    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
-        mask8x32 {
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0)
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
-        let lo = self.from_bitmask_mask8x16(bits);
-        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
-        self.combine_mask8x16(lo, hi)
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x32(a);
-        let lo = self.to_bitmask_mask8x16(lo);
-        let hi = self.to_bitmask_mask8x16(hi);
-        lo | (hi << 16usize)
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 32usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32usize
-        );
-        let mut lanes = self.as_array_mask8x32(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x32(lanes);
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_u8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask8x32(
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
         self,
-        a: mask8x32<Self>,
-        b: mask8x32<Self>,
-        c: mask8x32<Self>,
-    ) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        let (c0, c1) = self.split_mask8x32(c);
-        self.combine_mask8x16(
-            self.select_mask8x16(a0, b0, c0),
-            self.select_mask8x16(a1, b1, c1),
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(
+            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
-    }
-    #[inline(always)]
-    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
-        (
-            mask8x16 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask8x16 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
-        let half = self.splat_i16x8(val);
-        self.combine_i16x8(half, half)
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
     }
     #[inline(always)]
-    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
     }
     #[inline(always)]
-    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0)
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
     }
     #[inline(always)]
-    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0)
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i16; 16usize]>(&mut a.val.0)
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_alignr_128x2(
-            self,
-            self.cvt_to_bytes_i16x16(b).val.0,
-            self.cvt_to_bytes_i16x16(a).val.0,
-            SHIFT * 2usize,
-        );
-        self.cvt_from_bytes_i16x16(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x16<const SHIFT: usize>(
-        self,
-        a: i16x16<Self>,
-        b: i16x16<Self>,
-    ) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(
-            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
-        )
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, _) = self.split_u8x32(a);
+        let (b0, _) = self.split_u8x32(b);
+        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
     }
     #[inline(always)]
-    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (_, a1) = self.split_u8x32(a);
+        let (_, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
     }
     #[inline(always)]
-    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
     }
     #[inline(always)]
-    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let lo_lo = self.zip_low_u8x16(a0, b0);
+        let lo_hi = self.zip_high_u8x16(a0, b0);
+        let hi_lo = self.zip_low_u8x16(a1, b1);
+        let hi_hi = self.zip_high_u8x16(a1, b1);
+        (
+            self.combine_u8x16(lo_lo, lo_hi),
+            self.combine_u8x16(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let lo_even = self.unzip_low_u8x16(a0, a1);
+        let lo_odd = self.unzip_high_u8x16(a0, a1);
+        let hi_even = self.unzip_low_u8x16(b0, b1);
+        let hi_odd = self.unzip_high_u8x16(b0, b1);
+        (
+            self.combine_u8x16(lo_even, hi_even),
+            self.combine_u8x16(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let (c0, c1) = self.split_u8x32(c);
+        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
     }
     #[inline(always)]
-    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        (
+            u8x16 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            u8x16 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
     }
     #[inline(always)]
-    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u8x16(a0),
+            self.reinterpret_u32_u8x16(a1),
+        )
     }
     #[inline(always)]
-    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+        let half = self.splat_mask8x16(val);
+        self.combine_mask8x16(half, half)
     }
     #[inline(always)]
-    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        mask8x32 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
     }
     #[inline(always)]
-    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, _) = self.split_i16x16(a);
-        let (b0, _) = self.split_i16x16(b);
-        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
     }
     #[inline(always)]
-    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (_, a1) = self.split_i16x16(a);
-        let (_, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
     }
     #[inline(always)]
-    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let lo_lo = self.zip_low_i16x8(a0, b0);
-        let lo_hi = self.zip_high_i16x8(a0, b0);
-        let hi_lo = self.zip_low_i16x8(a1, b1);
-        let hi_hi = self.zip_high_i16x8(a1, b1);
-        (
-            self.combine_i16x8(lo_lo, lo_hi),
-            self.combine_i16x8(hi_lo, hi_hi),
-        )
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let lo_even = self.unzip_low_i16x8(a0, a1);
-        let lo_odd = self.unzip_high_i16x8(a0, a1);
-        let hi_even = self.unzip_low_i16x8(b0, b1);
-        let hi_odd = self.unzip_high_i16x8(b0, b1);
-        (
-            self.combine_i16x8(lo_even, hi_even),
-            self.combine_i16x8(lo_odd, hi_odd),
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
+    }
+    #[inline(always)]
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        let (c0, c1) = self.split_mask8x32(c);
+        self.combine_mask8x16(
+            self.select_mask8x16(a0, b0, c0),
+            self.select_mask8x16(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let (c0, c1) = self.split_i16x16(c);
-        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
     }
     #[inline(always)]
-    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
     }
     #[inline(always)]
-    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        mask8x64 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
         (
-            i16x8 {
+            mask8x16 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            i16x8 {
+            mask8x16 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_i16x8(a0),
-            self.reinterpret_u32_i16x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
-        let half = self.splat_u16x8(val);
-        self.combine_u16x8(half, half)
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        let half = self.splat_i16x8(val);
+        self.combine_i16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0)
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0)
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u16; 16usize]>(&mut a.val.0)
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
-        u16x16 {
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        i16x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_alignr_128x2(
             self,
-            self.cvt_to_bytes_u16x16(b).val.0,
-            self.cvt_to_bytes_u16x16(a).val.0,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
             SHIFT * 2usize,
         );
-        self.cvt_from_bytes_u16x16(u8x32 {
+        self.cvt_from_bytes_i16x16(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
         self,
-        a: u16x16<Self>,
-        b: u16x16<Self>,
-    ) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(
-            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(
+            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
     }
     #[inline(always)]
-    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
     }
     #[inline(always)]
-    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
     }
     #[inline(always)]
-    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, _) = self.split_u16x16(a);
-        let (b0, _) = self.split_u16x16(b);
-        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (_, a1) = self.split_u16x16(a);
-        let (_, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, _) = self.split_i16x16(a);
+        let (b0, _) = self.split_i16x16(b);
+        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
     }
     #[inline(always)]
-    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (_, a1) = self.split_i16x16(a);
+        let (_, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let lo_lo = self.zip_low_u16x8(a0, b0);
-        let lo_hi = self.zip_high_u16x8(a0, b0);
-        let hi_lo = self.zip_low_u16x8(a1, b1);
-        let hi_hi = self.zip_high_u16x8(a1, b1);
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let lo_lo = self.zip_low_i16x8(a0, b0);
+        let lo_hi = self.zip_high_i16x8(a0, b0);
+        let hi_lo = self.zip_low_i16x8(a1, b1);
+        let hi_hi = self.zip_high_i16x8(a1, b1);
         (
-            self.combine_u16x8(lo_lo, lo_hi),
-            self.combine_u16x8(hi_lo, hi_hi),
+            self.combine_i16x8(lo_lo, lo_hi),
+            self.combine_i16x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let lo_even = self.unzip_low_u16x8(a0, a1);
-        let lo_odd = self.unzip_high_u16x8(a0, a1);
-        let hi_even = self.unzip_low_u16x8(b0, b1);
-        let hi_odd = self.unzip_high_u16x8(b0, b1);
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let lo_even = self.unzip_low_i16x8(a0, a1);
+        let lo_odd = self.unzip_high_i16x8(a0, a1);
+        let hi_even = self.unzip_low_i16x8(b0, b1);
+        let hi_odd = self.unzip_high_i16x8(b0, b1);
         (
-            self.combine_u16x8(lo_even, hi_even),
-            self.combine_u16x8(lo_odd, hi_odd),
+            self.combine_i16x8(lo_even, hi_even),
+            self.combine_i16x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let (c0, c1) = self.split_u16x16(c);
-        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
+        let (b0, b1) = self.split_i16x16(b);
+        let (c0, c1) = self.split_i16x16(c);
+        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
         (
-            u16x8 {
+            i16x8 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            u16x8 {
+            i16x8 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, a: u16x16<Sse4_2>) -> u8x16<Sse4_2> {
-                let (a, b) = token.split_u16x16(a);
-                let mask = _mm_set1_epi16(0xFF);
-                let lo_masked = _mm_and_si128(a.into(), mask);
-                let hi_masked = _mm_and_si128(b.into(), mask);
-                let result = _mm_packus_epi16(lo_masked, hi_masked);
-                result.simd_into(token)
-            }
-        );
-        kernel(self, a)
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u16x16(a);
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i16x16(a);
         self.combine_u32x4(
-            self.reinterpret_u32_u16x8(a0),
-            self.reinterpret_u32_u16x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
-        let half = self.splat_mask16x8(val);
-        self.combine_mask16x8(half, half)
-    }
-    #[inline(always)]
-    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        mask16x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
-        let lo = self.from_bitmask_mask16x8(bits);
-        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
-        self.combine_mask16x8(lo, hi)
-    }
-    #[inline(always)]
-    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask16x16<Sse4_2>) -> u64 {
-                {
-                    let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
-                    _mm_movemask_epi8(packed) as u32 as u64
-                }
-            }
-        );
-        kernel(self, a)
-    }
-    #[inline(always)]
-    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask16x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x16(lanes);
-    }
-    #[inline(always)]
-    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
-    }
-    #[inline(always)]
-    fn select_mask16x16(
-        self,
-        a: mask16x16<Self>,
-        b: mask16x16<Self>,
-        c: mask16x16<Self>,
-    ) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        let (c0, c1) = self.split_mask16x16(c);
-        self.combine_mask16x8(
-            self.select_mask16x8(a0, b0, c0),
-            self.select_mask16x8(a1, b1, c1),
-        )
-    }
-    #[inline(always)]
-    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
-    }
-    #[inline(always)]
-    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
-    }
-    #[inline(always)]
-    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
-    }
-    #[inline(always)]
-    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
-    }
-    #[inline(always)]
-    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
-    }
-    #[inline(always)]
-    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
-        (
-            mask16x8 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask16x8 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
+            self.reinterpret_u32_i16x8(a0),
+            self.reinterpret_u32_i16x8(a1),
         )
     }
     #[inline(always)]
-    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
-        let half = self.splat_i32x4(val);
-        self.combine_i32x4(half, half)
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+        let half = self.splat_u16x8(val);
+        self.combine_u16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0)
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0)
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
-        i32x8 {
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        if SHIFT >= 8usize {
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_alignr_128x2(
             self,
-            self.cvt_to_bytes_i32x8(b).val.0,
-            self.cvt_to_bytes_i32x8(a).val.0,
-            SHIFT * 4usize,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_u16x16(a).val.0,
+            SHIFT * 2usize,
         );
-        self.cvt_from_bytes_i32x8(u8x32 {
+        self.cvt_from_bytes_u16x16(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x8<const SHIFT: usize>(
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
         self,
-        a: i32x8<Self>,
-        b: i32x8<Self>,
-    ) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(
-            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(
+            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
     }
     #[inline(always)]
-    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, _) = self.split_i32x8(a);
-        let (b0, _) = self.split_i32x8(b);
-        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, _) = self.split_u16x16(a);
+        let (b0, _) = self.split_u16x16(b);
+        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (_, a1) = self.split_i32x8(a);
-        let (_, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (_, a1) = self.split_u16x16(a);
+        let (_, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let lo_lo = self.zip_low_i32x4(a0, b0);
-        let lo_hi = self.zip_high_i32x4(a0, b0);
-        let hi_lo = self.zip_low_i32x4(a1, b1);
-        let hi_hi = self.zip_high_i32x4(a1, b1);
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let lo_lo = self.zip_low_u16x8(a0, b0);
+        let lo_hi = self.zip_high_u16x8(a0, b0);
+        let hi_lo = self.zip_low_u16x8(a1, b1);
+        let hi_hi = self.zip_high_u16x8(a1, b1);
         (
-            self.combine_i32x4(lo_lo, lo_hi),
-            self.combine_i32x4(hi_lo, hi_hi),
+            self.combine_u16x8(lo_lo, lo_hi),
+            self.combine_u16x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let lo_even = self.unzip_low_i32x4(a0, a1);
-        let lo_odd = self.unzip_high_i32x4(a0, a1);
-        let hi_even = self.unzip_low_i32x4(b0, b1);
-        let hi_odd = self.unzip_high_i32x4(b0, b1);
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let lo_even = self.unzip_low_u16x8(a0, a1);
+        let lo_odd = self.unzip_high_u16x8(a0, a1);
+        let hi_even = self.unzip_low_u16x8(b0, b1);
+        let hi_odd = self.unzip_high_u16x8(b0, b1);
         (
-            self.combine_i32x4(lo_even, hi_even),
-            self.combine_i32x4(lo_odd, hi_odd),
+            self.combine_u16x8(lo_even, hi_even),
+            self.combine_u16x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let (c0, c1) = self.split_i32x8(c);
-        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let (c0, c1) = self.split_u16x16(c);
+        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
-        i32x16 {
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        u16x32 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
         (
-            i32x4 {
+            u16x8 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            i32x4 {
+            u16x8 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u16x16<Sse4_2>) -> u8x16<Sse4_2> {
+                let (a, b) = token.split_u16x16(a);
+                let mask = _mm_set1_epi16(0xFF);
+                let lo_masked = _mm_and_si128(a.into(), mask);
+                let hi_masked = _mm_and_si128(b.into(), mask);
+                let result = _mm_packus_epi16(lo_masked, hi_masked);
+                result.simd_into(token)
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u16x16(a);
         self.combine_u32x4(
-            self.reinterpret_u32_i32x4(a0),
-            self.reinterpret_u32_i32x4(a1),
+            self.reinterpret_u32_u16x8(a0),
+            self.reinterpret_u32_u16x8(a1),
         )
     }
     #[inline(always)]
-    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
-    }
-    #[inline(always)]
-    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
-        let half = self.splat_u32x4(val);
-        self.combine_u32x4(half, half)
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+        let half = self.splat_mask16x8(val);
+        self.combine_mask16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        mask16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0)
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
     }
     #[inline(always)]
-    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0)
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask16x16<Sse4_2>) -> u64 {
+                {
+                    let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
+                    _mm_movemask_epi8(packed) as u32 as u64
+                }
+            }
+        );
+        kernel(self, a)
     }
     #[inline(always)]
-    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u32; 8usize]>(&mut a.val.0)
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
     }
     #[inline(always)]
-    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
-        u32x8 {
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
+    }
+    #[inline(always)]
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        let (c0, c1) = self.split_mask16x16(c);
+        self.combine_mask16x8(
+            self.select_mask16x8(a0, b0, c0),
+            self.select_mask16x8(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+        (
+            mask16x8 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            mask16x8 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+        let half = self.splat_i32x4(val);
+        self.combine_i32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i32; 8usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_128x2(
             self,
-            self.cvt_to_bytes_u32x8(b).val.0,
-            self.cvt_to_bytes_u32x8(a).val.0,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
             SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u32x8(u8x32 {
+        self.cvt_from_bytes_i32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
         self,
-        a: u32x8<Self>,
-        b: u32x8<Self>,
-    ) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(
-            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(
+            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
     }
     #[inline(always)]
-    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, _) = self.split_u32x8(a);
-        let (b0, _) = self.split_u32x8(b);
-        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, _) = self.split_i32x8(a);
+        let (b0, _) = self.split_i32x8(b);
+        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (_, a1) = self.split_u32x8(a);
-        let (_, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (_, a1) = self.split_i32x8(a);
+        let (_, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let lo_lo = self.zip_low_u32x4(a0, b0);
-        let lo_hi = self.zip_high_u32x4(a0, b0);
-        let hi_lo = self.zip_low_u32x4(a1, b1);
-        let hi_hi = self.zip_high_u32x4(a1, b1);
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let lo_lo = self.zip_low_i32x4(a0, b0);
+        let lo_hi = self.zip_high_i32x4(a0, b0);
+        let hi_lo = self.zip_low_i32x4(a1, b1);
+        let hi_hi = self.zip_high_i32x4(a1, b1);
         (
-            self.combine_u32x4(lo_lo, lo_hi),
-            self.combine_u32x4(hi_lo, hi_hi),
+            self.combine_i32x4(lo_lo, lo_hi),
+            self.combine_i32x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let lo_even = self.unzip_low_u32x4(a0, a1);
-        let lo_odd = self.unzip_high_u32x4(a0, a1);
-        let hi_even = self.unzip_low_u32x4(b0, b1);
-        let hi_odd = self.unzip_high_u32x4(b0, b1);
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let lo_even = self.unzip_low_i32x4(a0, a1);
+        let lo_odd = self.unzip_high_i32x4(a0, a1);
+        let hi_even = self.unzip_low_i32x4(b0, b1);
+        let hi_odd = self.unzip_high_i32x4(b0, b1);
         (
-            self.combine_u32x4(lo_even, hi_even),
-            self.combine_u32x4(lo_odd, hi_odd),
+            self.combine_i32x4(lo_even, hi_even),
+            self.combine_i32x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let (c0, c1) = self.split_u32x8(c);
-        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
+        let (b0, b1) = self.split_i32x8(b);
+        let (c0, c1) = self.split_i32x8(c);
+        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
-        u32x16 {
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
         (
-            u32x4 {
+            i32x4 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            u32x4 {
+            i32x4 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
     }
     #[inline(always)]
-    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
     }
     #[inline(always)]
-    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
-        let half = self.splat_mask32x4(val);
-        self.combine_mask32x4(half, half)
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i32x4(a0),
+            self.reinterpret_u32_i32x4(a1),
+        )
     }
     #[inline(always)]
-    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
-        mask32x8 {
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
+    }
+    #[inline(always)]
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        let half = self.splat_u32x4(val);
+        self.combine_u32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0)
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
-        let lo = self.from_bitmask_mask32x4(bits);
-        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
-        self.combine_mask32x4(lo, hi)
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x8(a);
-        let lo = self.to_bitmask_mask32x4(lo);
-        let hi = self.to_bitmask_mask32x4(hi);
-        lo | (hi << 4usize)
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 8usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8usize
-        );
-        let mut lanes = self.as_array_mask32x8(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x8(lanes);
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_u32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask32x8(
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
         self,
-        a: mask32x8<Self>,
-        b: mask32x8<Self>,
-        c: mask32x8<Self>,
-    ) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        let (c0, c1) = self.split_mask32x8(c);
-        self.combine_mask32x4(
-            self.select_mask32x4(a0, b0, c0),
-            self.select_mask32x4(a1, b1, c1),
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(
+            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
-    }
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
+    }
     #[inline(always)]
-    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
-        (
-            mask32x4 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask32x4 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
     }
     #[inline(always)]
-    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
-        let half = self.splat_f64x2(val);
-        self.combine_f64x2(half, half)
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
     }
     #[inline(always)]
-    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
     }
     #[inline(always)]
-    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
-        crate::transmute::checked_transmute_copy::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0)
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
-        crate::transmute::checked_cast_ref::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0)
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
-        crate::transmute::checked_cast_mut::<[__m128d; 2usize], [f64; 4usize]>(&mut a.val.0)
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let result = cross_block_alignr_128x2(
-            self,
-            self.cvt_to_bytes_f64x4(b).val.0,
-            self.cvt_to_bytes_f64x4(a).val.0,
-            SHIFT * 8usize,
-        );
-        self.cvt_from_bytes_f64x4(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, _) = self.split_u32x8(a);
+        let (b0, _) = self.split_u32x8(b);
+        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x4<const SHIFT: usize>(
-        self,
-        a: f64x4<Self>,
-        b: f64x4<Self>,
-    ) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
-        )
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (_, a1) = self.split_u32x8(a);
+        let (_, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
     }
     #[inline(always)]
-    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
     }
     #[inline(always)]
-    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let lo_lo = self.zip_low_u32x4(a0, b0);
+        let lo_hi = self.zip_high_u32x4(a0, b0);
+        let hi_lo = self.zip_low_u32x4(a1, b1);
+        let hi_hi = self.zip_high_u32x4(a1, b1);
+        (
+            self.combine_u32x4(lo_lo, lo_hi),
+            self.combine_u32x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(
-            self.approximate_recip_f64x2(a0),
-            self.approximate_recip_f64x2(a1),
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let lo_even = self.unzip_low_u32x4(a0, a1);
+        let lo_odd = self.unzip_high_u32x4(a0, a1);
+        let hi_even = self.unzip_low_u32x4(b0, b1);
+        let hi_odd = self.unzip_high_u32x4(b0, b1);
+        (
+            self.combine_u32x4(lo_even, hi_even),
+            self.combine_u32x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let (c0, c1) = self.split_u32x8(c);
+        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+        u32x16 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        (
+            u32x4 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            u32x4 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
     }
     #[inline(always)]
-    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
     }
     #[inline(always)]
-    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, _) = self.split_f64x4(a);
-        let (b0, _) = self.split_f64x4(b);
-        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
-    }
-    #[inline(always)]
-    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (_, a1) = self.split_f64x4(a);
-        let (_, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+        let half = self.splat_mask32x4(val);
+        self.combine_mask32x4(half, half)
     }
     #[inline(always)]
-    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+        mask32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let lo_lo = self.zip_low_f64x2(a0, b0);
-        let lo_hi = self.zip_high_f64x2(a0, b0);
-        let hi_lo = self.zip_low_f64x2(a1, b1);
-        let hi_hi = self.zip_high_f64x2(a1, b1);
-        (
-            self.combine_f64x2(lo_lo, lo_hi),
-            self.combine_f64x2(hi_lo, hi_hi),
-        )
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
     }
     #[inline(always)]
-    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let lo_even = self.unzip_low_f64x2(a0, a1);
-        let lo_odd = self.unzip_high_f64x2(a0, a1);
-        let hi_even = self.unzip_low_f64x2(b0, b1);
-        let hi_odd = self.unzip_high_f64x2(b0, b1);
-        (
-            self.combine_f64x2(lo_even, hi_even),
-            self.combine_f64x2(lo_odd, hi_odd),
-        )
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
     }
     #[inline(always)]
-    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
     }
     #[inline(always)]
-    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.max_precise_f64x2(a0, b0),
-            self.max_precise_f64x2(a1, b1),
-        )
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.min_precise_f64x2(a0, b0),
-            self.min_precise_f64x2(a1, b1),
-        )
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(
-            self.mul_add_f64x2(a0, b0, c0),
-            self.mul_add_f64x2(a1, b1, c1),
-        )
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
     }
     #[inline(always)]
-    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(
-            self.mul_sub_f64x2(a0, b0, c0),
-            self.mul_sub_f64x2(a1, b1, c1),
+    fn select_mask32x8(
+        self,
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        let (c0, c1) = self.split_mask32x8(c);
+        self.combine_mask32x4(
+            self.select_mask32x4(a0, b0, c0),
+            self.select_mask32x4(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
-    }
-    #[inline(always)]
-    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(
-            self.round_ties_even_f64x2(a0),
-            self.round_ties_even_f64x2(a1),
-        )
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
     }
     #[inline(always)]
-    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
     }
     #[inline(always)]
-    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
     }
     #[inline(always)]
-    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
     }
     #[inline(always)]
-    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        mask32x16 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
         (
-            f64x2 {
+            mask32x4 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            f64x2 {
+            mask32x4 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f32x4(
-            self.reinterpret_f32_f64x2(a0),
-            self.reinterpret_f32_f64x2(a1),
-        )
-    }
-    #[inline(always)]
-    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
-        let half = self.splat_mask64x2(val);
-        self.combine_mask64x2(half, half)
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+        let half = self.splat_f64x2(val);
+        self.combine_f64x2(half, half)
     }
     #[inline(always)]
-    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
-        mask64x4 {
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0)
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
-        let lo = self.from_bitmask_mask64x2(bits);
-        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
-        self.combine_mask64x2(lo, hi)
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
-        let (lo, hi) = self.split_mask64x4(a);
-        let lo = self.to_bitmask_mask64x2(lo);
-        let hi = self.to_bitmask_mask64x2(hi);
-        lo | (hi << 2usize)
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        crate::transmute::checked_cast_ref::<[__m128d; 2usize], [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 4usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4usize
-        );
-        let mut lanes = self.as_array_mask64x4(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x4(lanes);
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        crate::transmute::checked_cast_mut::<[__m128d; 2usize], [f64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            self.cvt_to_bytes_f64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask64x4(
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
         self,
-        a: mask64x4<Self>,
-        b: mask64x4<Self>,
-        c: mask64x4<Self>,
-    ) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        let (c0, c1) = self.split_mask64x4(c);
-        self.combine_mask64x2(
-            self.select_mask64x2(a0, b0, c0),
-            self.select_mask64x2(a1, b1, c1),
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
     }
     #[inline(always)]
-    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
     }
     #[inline(always)]
-    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
     }
     #[inline(always)]
-    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(
+            self.approximate_recip_f64x2(a0),
+            self.approximate_recip_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
-        mask64x8 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
-        (
-            mask64x2 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask64x2 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
-        let half = self.splat_f32x8(val);
-        self.combine_f32x8(half, half)
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m128; 4usize], [f32; 16usize]>(&a.val.0)
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m128; 4usize], [f32; 16usize]>(&a.val.0)
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m128; 4usize], [f32; 16usize]>(&mut a.val.0)
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, _) = self.split_f64x4(a);
+        let (b0, _) = self.split_f64x4(b);
+        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        u8x64 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (_, a1) = self.split_f64x4(a);
+        let (_, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_alignr_128x4(
-            self,
-            self.cvt_to_bytes_f32x16(b).val.0,
-            self.cvt_to_bytes_f32x16(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_f32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x16<const SHIFT: usize>(
-        self,
-        a: f32x16<Self>,
-        b: f32x16<Self>,
-    ) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
-        )
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
     }
     #[inline(always)]
-    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let lo_lo = self.zip_low_f64x2(a0, b0);
+        let lo_hi = self.zip_high_f64x2(a0, b0);
+        let hi_lo = self.zip_low_f64x2(a1, b1);
+        let hi_hi = self.zip_high_f64x2(a1, b1);
+        (
+            self.combine_f64x2(lo_lo, lo_hi),
+            self.combine_f64x2(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let lo_even = self.unzip_low_f64x2(a0, a1);
+        let lo_odd = self.unzip_high_f64x2(a0, a1);
+        let hi_even = self.unzip_low_f64x2(b0, b1);
+        let hi_odd = self.unzip_high_f64x2(b0, b1);
+        (
+            self.combine_f64x2(lo_even, hi_even),
+            self.combine_f64x2(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.approximate_recip_f32x8(a0),
-            self.approximate_recip_f32x8(a1),
-        )
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.max_precise_f64x2(a0, b0),
+            self.max_precise_f64x2(a1, b1),
+        )
     }
     #[inline(always)]
-    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.min_precise_f64x2(a0, b0),
+            self.min_precise_f64x2(a1, b1),
+        )
     }
     #[inline(always)]
-    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(
+            self.mul_add_f64x2(a0, b0, c0),
+            self.mul_add_f64x2(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(
+            self.mul_sub_f64x2(a0, b0, c0),
+            self.mul_sub_f64x2(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(
+            self.round_ties_even_f64x2(a0),
+            self.round_ties_even_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
     }
     #[inline(always)]
-    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, _) = self.split_f32x16(a);
-        let (b0, _) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
+        f64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (_, a1) = self.split_f32x16(a);
-        let (_, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+        (
+            f64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            f64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f32x4(
+            self.reinterpret_f32_f64x2(a0),
+            self.reinterpret_f32_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
+    fn splat_i64x4(self, val: i64) -> i64x4<Self> {
+        let half = self.splat_i64x2(val);
+        self.combine_i64x2(half, half)
     }
     #[inline(always)]
-    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_lo = self.zip_low_f32x8(a0, b0);
-        let lo_hi = self.zip_high_f32x8(a0, b0);
-        let hi_lo = self.zip_low_f32x8(a1, b1);
-        let hi_hi = self.zip_high_f32x8(a1, b1);
-        (
-            self.combine_f32x8(lo_lo, lo_hi),
-            self.combine_f32x8(hi_lo, hi_hi),
-        )
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_even = self.unzip_low_f32x8(a0, a1);
-        let lo_odd = self.unzip_high_f32x8(a0, a1);
-        let hi_even = self.unzip_low_f32x8(b0, b1);
-        let hi_odd = self.unzip_high_f32x8(b0, b1);
-        (
-            self.combine_f32x8(lo_even, hi_even),
-            self.combine_f32x8(lo_odd, hi_odd),
-        )
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.max_precise_f32x8(a0, b0),
-            self.max_precise_f32x8(a1, b1),
-        )
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [i64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.min_precise_f32x8(a0, b0),
-            self.min_precise_f32x8(a1, b1),
-        )
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_add_f32x8(a0, b0, c0),
-            self.mul_add_f32x8(a1, b1, c1),
-        )
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_sub_f32x8(a0, b0, c0),
-            self.mul_sub_f32x8(a1, b1, c1),
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_i64x4(b).val.0,
+            self.cvt_to_bytes_i64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x4<const SHIFT: usize>(
+        self,
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(
+            self.slide_within_blocks_i64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.round_ties_even_f32x8(a0),
-            self.round_ties_even_f32x8(a1),
-        )
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        (
-            f32x8 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            f32x8 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1))
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f64x4(
-            self.reinterpret_f64_f32x8(a0),
-            self.reinterpret_f64_f32x8(a1),
-        )
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift))
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.reinterpret_i32_f32x8(a0),
-            self.reinterpret_i32_f32x8(a1),
-        )
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, src: &[f32; 16usize]) -> f32x16<Sse4_2> {
-                let (chunks, []) = src.as_chunks::<4usize>() else {
-                    unreachable!()
-                };
-                let v0: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
-                let v1: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
-                let v2: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
-                let v3: __m128 =
-                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
-                let tmp0 = _mm_unpacklo_ps(v0, v1);
-                let tmp1 = _mm_unpackhi_ps(v0, v1);
-                let tmp2 = _mm_unpacklo_ps(v2, v3);
-                let tmp3 = _mm_unpackhi_ps(v2, v3);
-                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                token.combine_f32x8(
-                    token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)),
-                )
-            }
-        );
-        kernel(self, src)
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift))
     }
     #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, a: f32x16<Sse4_2>, dest: &mut [f32; 16usize]) -> () {
-                let (v01, v23) = token.split_f32x16(a);
-                let (v0, v1) = token.split_f32x8(v01);
-                let (v2, v3) = token.split_f32x8(v23);
-                let v0 = v0.into();
-                let v1 = v1.into();
-                let v2 = v2.into();
-                let v3 = v3.into();
-                let tmp0 = _mm_unpacklo_ps(v0, v1);
-                let tmp1 = _mm_unpackhi_ps(v0, v1);
-                let tmp2 = _mm_unpacklo_ps(v2, v3);
-                let tmp3 = _mm_unpackhi_ps(v2, v3);
-                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
-                    unreachable!()
-                };
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
-                    out0,
-                    &mut chunks[0],
-                );
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
-                    out1,
-                    &mut chunks[1],
-                );
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
-                    out2,
-                    &mut chunks[2],
-                );
-                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
-                    out3,
-                    &mut chunks[3],
-                );
-            }
-        );
-        kernel(self, a, dest);
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_f32x8(a0),
-            self.reinterpret_u32_f32x8(a1),
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, _) = self.split_i64x4(a);
+        let (b0, _) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (_, a1) = self.split_i64x4(a);
+        let (_, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_lo = self.zip_low_i64x2(a0, b0);
+        let lo_hi = self.zip_high_i64x2(a0, b0);
+        let hi_lo = self.zip_low_i64x2(a1, b1);
+        let hi_hi = self.zip_high_i64x2(a1, b1);
+        (
+            self.combine_i64x2(lo_lo, lo_hi),
+            self.combine_i64x2(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_even = self.unzip_low_i64x2(a0, a1);
+        let lo_odd = self.unzip_high_i64x2(a0, a1);
+        let hi_even = self.unzip_low_i64x2(b0, b1);
+        let hi_odd = self.unzip_high_i64x2(b0, b1);
+        (
+            self.combine_i64x2(lo_even, hi_even),
+            self.combine_i64x2(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.cvt_u32_precise_f32x8(a0),
-            self.cvt_u32_precise_f32x8(a1),
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let (c0, c1) = self.split_i64x4(c);
+        self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self> {
+        i64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (
+            i64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            i64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1))
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.cvt_i32_precise_f32x8(a0),
-            self.cvt_i32_precise_f32x8(a1),
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i64x2(a0),
+            self.reinterpret_u32_i64x2(a1),
         )
     }
     #[inline(always)]
-    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
-        let half = self.splat_i8x32(val);
-        self.combine_i8x32(half, half)
+    fn splat_u64x4(self, val: u64) -> u64x4<Self> {
+        let half = self.splat_u64x2(val);
+        self.combine_u64x2(half, half)
     }
     #[inline(always)]
-    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [u64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 2usize], [u64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 2usize], [u64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
-        i8x64 {
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self> {
+        u64x4 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        if SHIFT >= 4usize {
             return b;
         }
-        let result = cross_block_alignr_128x4(
+        let result = cross_block_alignr_128x2(
             self,
-            self.cvt_to_bytes_i8x64(b).val.0,
-            self.cvt_to_bytes_i8x64(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_u64x4(b).val.0,
+            self.cvt_to_bytes_u64x4(a).val.0,
+            SHIFT * 8usize,
         );
-        self.cvt_from_bytes_i8x64(u8x64 {
-            val: crate::support::Aligned512(result),
+        self.cvt_from_bytes_u64x4(u8x32 {
+            val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+    fn slide_within_blocks_u64x4<const SHIFT: usize>(
         self,
-        a: i8x64<Self>,
-        b: i8x64<Self>,
-    ) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(
-            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(
+            self.slide_within_blocks_u64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1))
     }
     #[inline(always)]
-    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, _) = self.split_i8x64(a);
-        let (b0, _) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, _) = self.split_u64x4(a);
+        let (b0, _) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (_, a1) = self.split_i8x64(a);
-        let (_, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (_, a1) = self.split_u64x4(a);
+        let (_, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_lo = self.zip_low_i8x32(a0, b0);
-        let lo_hi = self.zip_high_i8x32(a0, b0);
-        let hi_lo = self.zip_low_i8x32(a1, b1);
-        let hi_hi = self.zip_high_i8x32(a1, b1);
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_lo = self.zip_low_u64x2(a0, b0);
+        let lo_hi = self.zip_high_u64x2(a0, b0);
+        let hi_lo = self.zip_low_u64x2(a1, b1);
+        let hi_hi = self.zip_high_u64x2(a1, b1);
         (
-            self.combine_i8x32(lo_lo, lo_hi),
-            self.combine_i8x32(hi_lo, hi_hi),
+            self.combine_u64x2(lo_lo, lo_hi),
+            self.combine_u64x2(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_even = self.unzip_low_i8x32(a0, a1);
-        let lo_odd = self.unzip_high_i8x32(a0, a1);
-        let hi_even = self.unzip_low_i8x32(b0, b1);
-        let hi_odd = self.unzip_high_i8x32(b0, b1);
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_even = self.unzip_low_u64x2(a0, a1);
+        let lo_odd = self.unzip_high_u64x2(a0, a1);
+        let hi_even = self.unzip_low_u64x2(b0, b1);
+        let hi_odd = self.unzip_high_u64x2(b0, b1);
         (
-            self.combine_i8x32(lo_even, hi_even),
-            self.combine_i8x32(lo_odd, hi_odd),
+            self.combine_u64x2(lo_even, hi_even),
+            self.combine_u64x2(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let (c0, c1) = self.split_i8x64(c);
-        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let (c0, c1) = self.split_u64x4(c);
+        self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1))
     }
     #[inline(always)]
-    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self> {
+        u64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>) {
         (
-            i8x32 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+            u64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            i8x32 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+            u64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u64x2(a0),
+            self.reinterpret_u32_u64x2(a1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_i8x32(a0),
-            self.reinterpret_u32_i8x32(a1),
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        let half = self.splat_mask64x2(val);
+        self.combine_mask64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 2usize], [i64; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
+    }
+    #[inline(always)]
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        let (c0, c1) = self.split_mask64x4(c);
+        self.combine_mask64x2(
+            self.select_mask64x2(a0, b0, c0),
+            self.select_mask64x2(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
-        let half = self.splat_u8x32(val);
-        self.combine_u8x32(half, half)
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
     }
     #[inline(always)]
-    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+        (
+            mask64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            mask64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+        let half = self.splat_f32x8(val);
+        self.combine_f32x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0)
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[__m128; 4usize], [f32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0)
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        crate::transmute::checked_cast_ref::<[__m128; 4usize], [f32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        crate::transmute::checked_cast_mut::<[__m128; 4usize], [f32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        f32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_alignr_128x4(
             self,
-            self.cvt_to_bytes_u8x64(b).val.0,
-            self.cvt_to_bytes_u8x64(a).val.0,
-            SHIFT,
+            self.cvt_to_bytes_f32x16(b).val.0,
+            self.cvt_to_bytes_f32x16(a).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u8x64(u8x64 {
+        self.cvt_from_bytes_f32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
         self,
-        a: u8x64<Self>,
-        b: u8x64<Self>,
-    ) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(
-            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
     }
     #[inline(always)]
-    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
     }
     #[inline(always)]
-    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
     }
     #[inline(always)]
-    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.approximate_recip_f32x8(a0),
+            self.approximate_recip_f32x8(a1),
+        )
     }
     #[inline(always)]
-    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, _) = self.split_f32x16(a);
+        let (b0, _) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
     }
     #[inline(always)]
-    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (_, a1) = self.split_f32x16(a);
+        let (_, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, _) = self.split_u8x64(a);
-        let (b0, _) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
     }
     #[inline(always)]
-    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (_, a1) = self.split_u8x64(a);
-        let (_, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_lo = self.zip_low_f32x8(a0, b0);
+        let lo_hi = self.zip_high_f32x8(a0, b0);
+        let hi_lo = self.zip_low_f32x8(a1, b1);
+        let hi_hi = self.zip_high_f32x8(a1, b1);
+        (
+            self.combine_f32x8(lo_lo, lo_hi),
+            self.combine_f32x8(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_even = self.unzip_low_f32x8(a0, a1);
+        let lo_odd = self.unzip_high_f32x8(a0, a1);
+        let hi_even = self.unzip_low_f32x8(b0, b1);
+        let hi_odd = self.unzip_high_f32x8(b0, b1);
+        (
+            self.combine_f32x8(lo_even, hi_even),
+            self.combine_f32x8(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_lo = self.zip_low_u8x32(a0, b0);
-        let lo_hi = self.zip_high_u8x32(a0, b0);
-        let hi_lo = self.zip_low_u8x32(a1, b1);
-        let hi_hi = self.zip_high_u8x32(a1, b1);
-        (
-            self.combine_u8x32(lo_lo, lo_hi),
-            self.combine_u8x32(hi_lo, hi_hi),
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.max_precise_f32x8(a0, b0),
+            self.max_precise_f32x8(a1, b1),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_even = self.unzip_low_u8x32(a0, a1);
-        let lo_odd = self.unzip_high_u8x32(a0, a1);
-        let hi_even = self.unzip_low_u8x32(b0, b1);
-        let hi_odd = self.unzip_high_u8x32(b0, b1);
-        (
-            self.combine_u8x32(lo_even, hi_even),
-            self.combine_u8x32(lo_odd, hi_odd),
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.min_precise_f32x8(a0, b0),
+            self.min_precise_f32x8(a1, b1),
         )
     }
     #[inline(always)]
-    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let (c0, c1) = self.split_u8x64(c);
-        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_add_f32x8(a0, b0, c0),
+            self.mul_add_f32x8(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_sub_f32x8(a0, b0, c0),
+            self.mul_sub_f32x8(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
     }
     #[inline(always)]
-    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        (
-            u8x32 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            u8x32 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
     }
     #[inline(always)]
-    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, src: &[u8; 64usize]) -> u8x64<Sse4_2> {
-                let (chunks, []) = src.as_chunks::<16usize>() else {
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.round_ties_even_f32x8(a0),
+            self.round_ties_even_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
+    }
+    #[inline(always)]
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
+    }
+    #[inline(always)]
+    fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        (
+            f32x8 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            f32x8 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f64x4(
+            self.reinterpret_f64_f32x8(a0),
+            self.reinterpret_f64_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.reinterpret_i32_f32x8(a0),
+            self.reinterpret_i32_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[f32; 16usize]) -> f32x16<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
                     unreachable!()
                 };
-                let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
-                let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
-                let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
-                let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
-                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-                let v0 = _mm_shuffle_epi8(v0, mask);
-                let v1 = _mm_shuffle_epi8(v1, mask);
-                let v2 = _mm_shuffle_epi8(v2, mask);
-                let v3 = _mm_shuffle_epi8(v3, mask);
-                let tmp0 = _mm_unpacklo_epi32(v0, v1);
-                let tmp1 = _mm_unpackhi_epi32(v0, v1);
-                let tmp2 = _mm_unpacklo_epi32(v2, v3);
-                let tmp3 = _mm_unpackhi_epi32(v2, v3);
-                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                token.combine_u8x32(
-                    token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)),
+                let v0: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
+                let v1: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
+                let v2: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
+                let v3: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
+                let tmp0 = _mm_unpacklo_ps(v0, v1);
+                let tmp1 = _mm_unpackhi_ps(v0, v1);
+                let tmp2 = _mm_unpacklo_ps(v2, v3);
+                let tmp3 = _mm_unpackhi_ps(v2, v3);
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                token.combine_f32x8(
+                    token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)),
                 )
             }
         );
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: u8x64<Sse4_2>, dest: &mut [u8; 64usize]) -> () {
-                let (v01, v23) = token.split_u8x64(a);
-                let (v0, v1) = token.split_u8x32(v01);
-                let (v2, v3) = token.split_u8x32(v23);
+            fn kernel(token: Sse4_2, a: f32x16<Sse4_2>, dest: &mut [f32; 16usize]) -> () {
+                let (v01, v23) = token.split_f32x16(a);
+                let (v0, v1) = token.split_f32x8(v01);
+                let (v2, v3) = token.split_f32x8(v23);
                 let v0 = v0.into();
                 let v1 = v1.into();
                 let v2 = v2.into();
                 let v3 = v3.into();
-                let tmp0 = _mm_unpacklo_epi32(v0, v1);
-                let tmp1 = _mm_unpackhi_epi32(v0, v1);
-                let tmp2 = _mm_unpacklo_epi32(v2, v3);
-                let tmp3 = _mm_unpackhi_epi32(v2, v3);
-                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-                let out0 = _mm_shuffle_epi8(out0, mask);
-                let out1 = _mm_shuffle_epi8(out1, mask);
-                let out2 = _mm_shuffle_epi8(out2, mask);
-                let out3 = _mm_shuffle_epi8(out3, mask);
-                let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+                let tmp0 = _mm_unpacklo_ps(v0, v1);
+                let tmp1 = _mm_unpackhi_ps(v0, v1);
+                let tmp2 = _mm_unpacklo_ps(v2, v3);
+                let tmp3 = _mm_unpackhi_ps(v2, v3);
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
                     unreachable!()
                 };
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
                     out0,
                     &mut chunks[0],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
                     out1,
                     &mut chunks[1],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
                     out2,
                     &mut chunks[2],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
                     out3,
                     &mut chunks[3],
                 );
@@ -7743,727 +8623,585 @@ impl Simd for Sse4_2 {
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u8x64(a);
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
         self.combine_u32x8(
-            self.reinterpret_u32_u8x32(a0),
-            self.reinterpret_u32_u8x32(a1),
+            self.reinterpret_u32_f32x8(a0),
+            self.reinterpret_u32_f32x8(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
-        let half = self.splat_mask8x32(val);
-        self.combine_mask8x32(half, half)
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
     }
     #[inline(always)]
-    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
-        mask8x64 {
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(
+            self.cvt_u32_precise_f32x8(a0),
+            self.cvt_u32_precise_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.cvt_i32_precise_f32x8(a0),
+            self.cvt_i32_precise_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+        let half = self.splat_i8x32(val);
+        self.combine_i8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0)
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, bits: u64) -> mask8x64<Sse4_2> {
-                {
-                    let bit_bytes = _mm_set1_epi64x(bits.cast_signed());
-                    let bit_mask =
-                        _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128);
-                    mask8x64 {
-                        val: crate::support::Aligned512([
-                            {
-                                let bit_bytes = _mm_shuffle_epi8(
-                                    bit_bytes,
-                                    _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
-                                );
-                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-                            },
-                            {
-                                let bit_bytes = _mm_shuffle_epi8(
-                                    bit_bytes,
-                                    _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3),
-                                );
-                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-                            },
-                            {
-                                let bit_bytes = _mm_shuffle_epi8(
-                                    bit_bytes,
-                                    _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5),
-                                );
-                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-                            },
-                            {
-                                let bit_bytes = _mm_shuffle_epi8(
-                                    bit_bytes,
-                                    _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7),
-                                );
-                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-                            },
-                        ]),
-                        simd: token,
-                    }
-                }
-            }
-        );
-        kernel(self, bits)
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x64(a);
-        let lo = self.to_bitmask_mask8x32(lo);
-        let hi = self.to_bitmask_mask8x32(hi);
-        lo | (hi << 32usize)
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 64usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            64usize
-        );
-        let mut lanes = self.as_array_mask8x64(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x64(lanes);
-    }
-    #[inline(always)]
-    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
-    }
-    #[inline(always)]
-    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
-    }
-    #[inline(always)]
-    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
-    }
-    #[inline(always)]
-    fn select_mask8x64(
-        self,
-        a: mask8x64<Self>,
-        b: mask8x64<Self>,
-        c: mask8x64<Self>,
-    ) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        let (c0, c1) = self.split_mask8x64(c);
-        self.combine_mask8x32(
-            self.select_mask8x32(a0, b0, c0),
-            self.select_mask8x32(a1, b1, c1),
-        )
-    }
-    #[inline(always)]
-    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
-    }
-    #[inline(always)]
-    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
-    }
-    #[inline(always)]
-    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
-    }
-    #[inline(always)]
-    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
-    }
-    #[inline(always)]
-    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
-    }
-    #[inline(always)]
-    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
-        (
-            mask8x32 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            mask8x32 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
-    }
-    #[inline(always)]
-    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
-        let half = self.splat_i16x16(val);
-        self.combine_i16x16(half, half)
-    }
-    #[inline(always)]
-    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
-    }
-    #[inline(always)]
-    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
+        i8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        if SHIFT >= 64usize {
             return b;
         }
         let result = cross_block_alignr_128x4(
             self,
-            self.cvt_to_bytes_i16x32(b).val.0,
-            self.cvt_to_bytes_i16x32(a).val.0,
-            SHIFT * 2usize,
+            self.cvt_to_bytes_i8x64(b).val.0,
+            self.cvt_to_bytes_i8x64(a).val.0,
+            SHIFT,
         );
-        self.cvt_from_bytes_i16x32(u8x64 {
+        self.cvt_from_bytes_i8x64(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+    fn slide_within_blocks_i8x64<const SHIFT: usize>(
         self,
-        a: i16x32<Self>,
-        b: i16x32<Self>,
-    ) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(
+            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
     }
     #[inline(always)]
-    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, _) = self.split_i16x32(a);
-        let (b0, _) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, _) = self.split_i8x64(a);
+        let (b0, _) = self.split_i8x64(b);
+        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (_, a1) = self.split_i16x32(a);
-        let (_, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (_, a1) = self.split_i8x64(a);
+        let (_, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.unzip_high_i16x16(a0, a1),
-            self.unzip_high_i16x16(b0, b1),
-        )
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_lo = self.zip_low_i16x16(a0, b0);
-        let lo_hi = self.zip_high_i16x16(a0, b0);
-        let hi_lo = self.zip_low_i16x16(a1, b1);
-        let hi_hi = self.zip_high_i16x16(a1, b1);
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_lo = self.zip_low_i8x32(a0, b0);
+        let lo_hi = self.zip_high_i8x32(a0, b0);
+        let hi_lo = self.zip_low_i8x32(a1, b1);
+        let hi_hi = self.zip_high_i8x32(a1, b1);
         (
-            self.combine_i16x16(lo_lo, lo_hi),
-            self.combine_i16x16(hi_lo, hi_hi),
+            self.combine_i8x32(lo_lo, lo_hi),
+            self.combine_i8x32(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_even = self.unzip_low_i16x16(a0, a1);
-        let lo_odd = self.unzip_high_i16x16(a0, a1);
-        let hi_even = self.unzip_low_i16x16(b0, b1);
-        let hi_odd = self.unzip_high_i16x16(b0, b1);
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_even = self.unzip_low_i8x32(a0, a1);
+        let lo_odd = self.unzip_high_i8x32(a0, a1);
+        let hi_even = self.unzip_low_i8x32(b0, b1);
+        let hi_odd = self.unzip_high_i8x32(b0, b1);
         (
-            self.combine_i16x16(lo_even, hi_even),
-            self.combine_i16x16(lo_odd, hi_odd),
+            self.combine_i8x32(lo_even, hi_even),
+            self.combine_i8x32(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let (c0, c1) = self.split_i16x32(c);
-        self.combine_i16x16(
-            self.select_i16x16(a0, b0, c0),
-            self.select_i16x16(a1, b1, c1),
-        )
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let (c0, c1) = self.split_i8x64(c);
+        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
     }
     #[inline(always)]
-    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
         (
-            i16x16 {
+            i8x32 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            i16x16 {
+            i8x32 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_i16x16(a0),
-            self.reinterpret_u8_i16x16(a1),
-        )
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i16x32(a);
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i8x64(a);
         self.combine_u32x8(
-            self.reinterpret_u32_i16x16(a0),
-            self.reinterpret_u32_i16x16(a1),
+            self.reinterpret_u32_i8x32(a0),
+            self.reinterpret_u32_i8x32(a1),
         )
     }
     #[inline(always)]
-    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
-        let half = self.splat_u16x16(val);
-        self.combine_u16x16(half, half)
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
+        let half = self.splat_u8x32(val);
+        self.combine_u8x32(half, half)
     }
     #[inline(always)]
-    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0)
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0)
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u8; 64usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        if SHIFT >= 64usize {
             return b;
         }
         let result = cross_block_alignr_128x4(
             self,
-            self.cvt_to_bytes_u16x32(b).val.0,
-            self.cvt_to_bytes_u16x32(a).val.0,
-            SHIFT * 2usize,
+            self.cvt_to_bytes_u8x64(b).val.0,
+            self.cvt_to_bytes_u8x64(a).val.0,
+            SHIFT,
         );
-        self.cvt_from_bytes_u16x32(u8x64 {
+        self.cvt_from_bytes_u8x64(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+    fn slide_within_blocks_u8x64<const SHIFT: usize>(
         self,
-        a: u16x32<Self>,
-        b: u16x32<Self>,
-    ) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(
+            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
     }
     #[inline(always)]
-    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, _) = self.split_u16x32(a);
-        let (b0, _) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, _) = self.split_u8x64(a);
+        let (b0, _) = self.split_u8x64(b);
+        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (_, a1) = self.split_u16x32(a);
-        let (_, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (_, a1) = self.split_u8x64(a);
+        let (_, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.unzip_high_u16x16(a0, a1),
-            self.unzip_high_u16x16(b0, b1),
-        )
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_lo = self.zip_low_u16x16(a0, b0);
-        let lo_hi = self.zip_high_u16x16(a0, b0);
-        let hi_lo = self.zip_low_u16x16(a1, b1);
-        let hi_hi = self.zip_high_u16x16(a1, b1);
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_lo = self.zip_low_u8x32(a0, b0);
+        let lo_hi = self.zip_high_u8x32(a0, b0);
+        let hi_lo = self.zip_low_u8x32(a1, b1);
+        let hi_hi = self.zip_high_u8x32(a1, b1);
         (
-            self.combine_u16x16(lo_lo, lo_hi),
-            self.combine_u16x16(hi_lo, hi_hi),
+            self.combine_u8x32(lo_lo, lo_hi),
+            self.combine_u8x32(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_even = self.unzip_low_u16x16(a0, a1);
-        let lo_odd = self.unzip_high_u16x16(a0, a1);
-        let hi_even = self.unzip_low_u16x16(b0, b1);
-        let hi_odd = self.unzip_high_u16x16(b0, b1);
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_even = self.unzip_low_u8x32(a0, a1);
+        let lo_odd = self.unzip_high_u8x32(a0, a1);
+        let hi_even = self.unzip_low_u8x32(b0, b1);
+        let hi_odd = self.unzip_high_u8x32(b0, b1);
         (
-            self.combine_u16x16(lo_even, hi_even),
-            self.combine_u16x16(lo_odd, hi_odd),
+            self.combine_u8x32(lo_even, hi_even),
+            self.combine_u8x32(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let (c0, c1) = self.split_u16x32(c);
-        self.combine_u16x16(
-            self.select_u16x16(a0, b0, c0),
-            self.select_u16x16(a1, b1, c1),
-        )
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let (c0, c1) = self.split_u8x64(c);
+        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
     }
     #[inline(always)]
-    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
         (
-            u16x16 {
+            u8x32 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            u16x16 {
+            u8x32 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, src: &[u16; 32usize]) -> u16x32<Sse4_2> {
-                let (chunks, []) = src.as_chunks::<8usize>() else {
+            fn kernel(token: Sse4_2, src: &[u8; 64usize]) -> u8x64<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<16usize>() else {
                     unreachable!()
                 };
                 let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
                 let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
                 let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
                 let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
-                let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
                 let v0 = _mm_shuffle_epi8(v0, mask);
                 let v1 = _mm_shuffle_epi8(v1, mask);
                 let v2 = _mm_shuffle_epi8(v2, mask);
@@ -8476,22 +9214,22 @@ impl Simd for Sse4_2 {
                 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
                 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
                 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                token.combine_u16x16(
-                    token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)),
+                token.combine_u8x32(
+                    token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)),
                 )
             }
         );
         kernel(self, src)
     }
     #[inline(always)]
-    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: u16x32<Sse4_2>, dest: &mut [u16; 32usize]) -> () {
-                let (v01, v23) = token.split_u16x32(a);
-                let (v0, v1) = token.split_u16x16(v01);
-                let (v2, v3) = token.split_u16x16(v23);
+            fn kernel(token: Sse4_2, a: u8x64<Sse4_2>, dest: &mut [u8; 64usize]) -> () {
+                let (v01, v23) = token.split_u8x64(a);
+                let (v0, v1) = token.split_u8x32(v01);
+                let (v2, v3) = token.split_u8x32(v23);
                 let v0 = v0.into();
                 let v1 = v1.into();
                 let v2 = v2.into();
@@ -8504,27 +9242,27 @@ impl Simd for Sse4_2 {
                 let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
                 let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
                 let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
                 let out0 = _mm_shuffle_epi8(out0, mask);
                 let out1 = _mm_shuffle_epi8(out1, mask);
                 let out2 = _mm_shuffle_epi8(out2, mask);
                 let out3 = _mm_shuffle_epi8(out3, mask);
-                let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+                let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
                     unreachable!()
                 };
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out0,
                     &mut chunks[0],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out1,
                     &mut chunks[1],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out2,
                     &mut chunks[2],
                 );
-                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
                     out3,
                     &mut chunks[3],
                 );
@@ -8533,1201 +9271,2595 @@ impl Simd for Sse4_2 {
         kernel(self, a, dest);
     }
     #[inline(always)]
-    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_u16x16(a0),
-            self.reinterpret_u8_u16x16(a1),
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u8x32(a0),
+            self.reinterpret_u32_u8x32(a1),
         )
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_u16x16(a0),
-            self.reinterpret_u32_u16x16(a1),
-        )
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
+        let half = self.splat_mask8x32(val);
+        self.combine_mask8x32(half, half)
     }
     #[inline(always)]
-    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
-        let half = self.splat_mask16x16(val);
-        self.combine_mask16x16(half, half)
-    }
-    #[inline(always)]
-    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
-        mask16x32 {
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
+        mask8x64 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
-        let lo = self.from_bitmask_mask16x16(bits);
-        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
-        self.combine_mask16x16(lo, hi)
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i8; 64usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
         crate::kernel!(
             #[inline(always)]
-            fn kernel(token: Sse4_2, a: mask16x32<Sse4_2>) -> u64 {
+            fn kernel(token: Sse4_2, bits: u64) -> mask8x64<Sse4_2> {
                 {
-                    let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
-                    let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]);
-                    let lo = _mm_movemask_epi8(lo) as u32 as u64;
-                    let hi = _mm_movemask_epi8(hi) as u32 as u64;
-                    lo | (hi << 16usize)
+                    let bit_bytes = _mm_set1_epi64x(bits.cast_signed());
+                    let bit_mask =
+                        _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128);
+                    mask8x64 {
+                        val: crate::support::Aligned512([
+                            {
+                                let bit_bytes = _mm_shuffle_epi8(
+                                    bit_bytes,
+                                    _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
+                                );
+                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                            },
+                            {
+                                let bit_bytes = _mm_shuffle_epi8(
+                                    bit_bytes,
+                                    _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3),
+                                );
+                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                            },
+                            {
+                                let bit_bytes = _mm_shuffle_epi8(
+                                    bit_bytes,
+                                    _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5),
+                                );
+                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                            },
+                            {
+                                let bit_bytes = _mm_shuffle_epi8(
+                                    bit_bytes,
+                                    _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7),
+                                );
+                                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                            },
+                        ]),
+                        simd: token,
+                    }
                 }
             }
         );
-        kernel(self, a)
+        kernel(self, bits)
     }
     #[inline(always)]
-    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 32usize,
+            index < 64usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            32usize
+            64usize
         );
-        let mut lanes = self.as_array_mask16x32(*a);
+        let mut lanes = self.as_array_mask8x64(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x32(lanes);
+        *a = self.load_array_mask8x64(lanes);
     }
     #[inline(always)]
-    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
     }
     #[inline(always)]
-    fn select_mask16x32(
+    fn select_mask8x64(
         self,
-        a: mask16x32<Self>,
-        b: mask16x32<Self>,
-        c: mask16x32<Self>,
-    ) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        let (c0, c1) = self.split_mask16x32(c);
-        self.combine_mask16x16(
-            self.select_mask16x16(a0, b0, c0),
-            self.select_mask16x16(a1, b1, c1),
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        let (c0, c1) = self.split_mask8x64(c);
+        self.combine_mask8x32(
+            self.select_mask8x32(a0, b0, c0),
+            self.select_mask8x32(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
+    }
+    #[inline(always)]
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
+    }
+    #[inline(always)]
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
+    }
+    #[inline(always)]
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+        (
+            mask8x32 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            mask8x32 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
+        let half = self.splat_i16x16(val);
+        self.combine_i16x16(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i16; 32usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i16; 32usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
+        i16x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_i16x32(b).val.0,
+            self.cvt_to_bytes_i16x32(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+        self,
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(
-            self.simd_eq_mask16x16(a0, b0),
-            self.simd_eq_mask16x16(a1, b1),
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
+    }
+    #[inline(always)]
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, _) = self.split_i16x32(a);
+        let (b0, _) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (_, a1) = self.split_i16x32(a);
+        let (_, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.unzip_high_i16x16(a0, a1),
+            self.unzip_high_i16x16(b0, b1),
+        )
+    }
+    #[inline(always)]
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_lo = self.zip_low_i16x16(a0, b0);
+        let lo_hi = self.zip_high_i16x16(a0, b0);
+        let hi_lo = self.zip_low_i16x16(a1, b1);
+        let hi_hi = self.zip_high_i16x16(a1, b1);
+        (
+            self.combine_i16x16(lo_lo, lo_hi),
+            self.combine_i16x16(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_even = self.unzip_low_i16x16(a0, a1);
+        let lo_odd = self.unzip_high_i16x16(a0, a1);
+        let hi_even = self.unzip_low_i16x16(b0, b1);
+        let hi_odd = self.unzip_high_i16x16(b0, b1);
+        (
+            self.combine_i16x16(lo_even, hi_even),
+            self.combine_i16x16(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let (c0, c1) = self.split_i16x32(c);
+        self.combine_i16x16(
+            self.select_i16x16(a0, b0, c0),
+            self.select_i16x16(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        (
+            i16x16 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            i16x16 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_i16x16(a0),
+            self.reinterpret_u8_i16x16(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i16x16(a0),
+            self.reinterpret_u32_i16x16(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
+        let half = self.splat_u16x16(val);
+        self.combine_u16x16(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u16; 32usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u16; 32usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
+        u16x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_u16x32(b).val.0,
+            self.cvt_to_bytes_u16x32(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+        self,
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
+    }
+    #[inline(always)]
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
+    }
+    /// Compares matching halves of `a` and `b` with `simd_eq_u16x16` and joins the half-masks.
+    #[inline(always)]
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.simd_eq_u16x16(a_lo, b_lo);
+        let hi = self.simd_eq_u16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_lt_u16x16` and joins the half-masks.
+    #[inline(always)]
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.simd_lt_u16x16(a_lo, b_lo);
+        let hi = self.simd_lt_u16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_le_u16x16` and joins the half-masks.
+    #[inline(always)]
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.simd_le_u16x16(a_lo, b_lo);
+        let hi = self.simd_le_u16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_ge_u16x16` and joins the half-masks.
+    #[inline(always)]
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.simd_ge_u16x16(a_lo, b_lo);
+        let hi = self.simd_ge_u16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_gt_u16x16` and joins the half-masks.
+    #[inline(always)]
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.simd_gt_u16x16(a_lo, b_lo);
+        let hi = self.simd_gt_u16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Zips the first halves of `a` and `b`: `zip_low_u16x16` of them fills the first half of
+    /// the result and `zip_high_u16x16` of them fills the second. The second halves are unused.
+    #[inline(always)]
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a_lo, _) = self.split_u16x32(a);
+        let (b_lo, _) = self.split_u16x32(b);
+        let first = self.zip_low_u16x16(a_lo, b_lo);
+        let second = self.zip_high_u16x16(a_lo, b_lo);
+        self.combine_u16x16(first, second)
+    }
+    /// Zips the second halves of `a` and `b`: `zip_low_u16x16` of them fills the first half of
+    /// the result and `zip_high_u16x16` of them fills the second. The first halves are unused.
+    #[inline(always)]
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (_, a_hi) = self.split_u16x32(a);
+        let (_, b_hi) = self.split_u16x32(b);
+        let first = self.zip_low_u16x16(a_hi, b_hi);
+        let second = self.zip_high_u16x16(a_hi, b_hi);
+        self.combine_u16x16(first, second)
+    }
+    /// Joins `unzip_low_u16x16` of the two halves of `a` with `unzip_low_u16x16` of the two
+    /// halves of `b`.
+    #[inline(always)]
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let from_a = self.unzip_low_u16x16(a_lo, a_hi);
+        let from_b = self.unzip_low_u16x16(b_lo, b_hi);
+        self.combine_u16x16(from_a, from_b)
+    }
+    /// Joins `unzip_high_u16x16` of the two halves of `a` with `unzip_high_u16x16` of the two
+    /// halves of `b`.
+    #[inline(always)]
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let from_a = self.unzip_high_u16x16(a_lo, a_hi);
+        let from_b = self.unzip_high_u16x16(b_lo, b_hi);
+        self.combine_u16x16(from_a, from_b)
+    }
+    /// Zips the first halves of `a` and `b` (low then high) into the first output and
+    /// their second halves, in the same way, into the second output.
+    #[inline(always)]
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let first = self.combine_u16x16(
+            self.zip_low_u16x16(a_lo, b_lo),
+            self.zip_high_u16x16(a_lo, b_lo),
+        );
+        let second = self.combine_u16x16(
+            self.zip_low_u16x16(a_hi, b_hi),
+            self.zip_high_u16x16(a_hi, b_hi),
+        );
+        (first, second)
+    }
+    /// Unzips `a` and `b` separately: the first output joins the `unzip_low` results of each
+    /// input, the second output joins their `unzip_high` results.
+    #[inline(always)]
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let evens = self.combine_u16x16(
+            self.unzip_low_u16x16(a_lo, a_hi),
+            self.unzip_low_u16x16(b_lo, b_hi),
+        );
+        let odds = self.combine_u16x16(
+            self.unzip_high_u16x16(a_lo, a_hi),
+            self.unzip_high_u16x16(b_lo, b_hi),
+        );
+        (evens, odds)
+    }
+    /// Splits the mask `a` and the operands `b` and `c` into halves, runs `select_u16x16` on
+    /// each set of matching halves, and joins the two results.
+    #[inline(always)]
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
+        let (mask_lo, mask_hi) = self.split_mask16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let (c_lo, c_hi) = self.split_u16x32(c);
+        let lo = self.select_u16x16(mask_lo, b_lo, c_lo);
+        let hi = self.select_u16x16(mask_hi, b_hi, c_hi);
+        self.combine_u16x16(lo, hi)
+    }
+    /// Runs `min_u16x16` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.min_u16x16(a_lo, b_lo);
+        let hi = self.min_u16x16(a_hi, b_hi);
+        self.combine_u16x16(lo, hi)
+    }
+    /// Runs `max_u16x16` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a_lo, a_hi) = self.split_u16x32(a);
+        let (b_lo, b_hi) = self.split_u16x32(b);
+        let lo = self.max_u16x16(a_lo, b_lo);
+        let hi = self.max_u16x16(a_hi, b_hi);
+        self.combine_u16x16(lo, hi)
+    }
+    /// Splits `a` into two `u16x16` halves: storage elements 0 and 1 form the first half,
+    /// elements 2 and 3 form the second.
+    #[inline(always)]
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let lo = u16x16 {
+            val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+            simd: self,
+        };
+        let hi = u16x16 {
+            val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+            simd: self,
+        };
+        (lo, hi)
+    }
+    /// Loads 32 `u16` values stored as consecutive groups of four and regroups them: the four
+    /// 128-bit results hold, in order, elements 0, 1, 2 and 3 of every group.
+    #[inline(always)]
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[u16; 32usize]) -> u16x32<Sse4_2> {
+                let (rows, []) = src.as_chunks::<8usize>() else {
+                    unreachable!()
+                };
+                let r0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&rows[0]);
+                let r1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&rows[1]);
+                let r2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&rows[2]);
+                let r3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&rows[3]);
+                // Reorders a row `[x0 y0 z0 w0 x1 y1 z1 w1]` into `[x0 x1 y0 y1 z0 z1 w0 w1]`.
+                let perm = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+                let r0 = _mm_shuffle_epi8(r0, perm);
+                let r1 = _mm_shuffle_epi8(r1, perm);
+                let r2 = _mm_shuffle_epi8(r2, perm);
+                let r3 = _mm_shuffle_epi8(r3, perm);
+                // 32-bit unpacks gather the x/y pairs and z/w pairs of two rows each.
+                let xy_lo = _mm_unpacklo_epi32(r0, r1);
+                let zw_lo = _mm_unpackhi_epi32(r0, r1);
+                let xy_hi = _mm_unpacklo_epi32(r2, r3);
+                let zw_hi = _mm_unpackhi_epi32(r2, r3);
+                // 64-bit unpacks then collect all eight values of one component per register.
+                let x = _mm_unpacklo_epi64(xy_lo, xy_hi);
+                let y = _mm_unpackhi_epi64(xy_lo, xy_hi);
+                let z = _mm_unpacklo_epi64(zw_lo, zw_hi);
+                let w = _mm_unpackhi_epi64(zw_lo, zw_hi);
+                let xy = token.combine_u16x8(x.simd_into(token), y.simd_into(token));
+                let zw = token.combine_u16x8(z.simd_into(token), w.simd_into(token));
+                token.combine_u16x16(xy, zw)
+            }
+        );
+        kernel(self, src)
+    }
+    /// Writes the four 128-bit registers of `a` to `dest` interleaved in groups of four, so that
+    /// `dest[4 * i + k]` receives lane `i` of register `k`.
+    #[inline(always)]
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u16x32<Sse4_2>, dest: &mut [u16; 32usize]) -> () {
+                let (lo, hi) = token.split_u16x32(a);
+                let (x, y) = token.split_u16x16(lo);
+                let (z, w) = token.split_u16x16(hi);
+                let x = x.into();
+                let y = y.into();
+                let z = z.into();
+                let w = w.into();
+                // 32-bit unpacks pair up lanes of x with y and of z with w.
+                let xy_lo = _mm_unpacklo_epi32(x, y);
+                let xy_hi = _mm_unpackhi_epi32(x, y);
+                let zw_lo = _mm_unpacklo_epi32(z, w);
+                let zw_hi = _mm_unpackhi_epi32(z, w);
+                // 64-bit unpacks put two lanes of each of x, y, z and w into every register.
+                let g0 = _mm_unpacklo_epi64(xy_lo, zw_lo);
+                let g1 = _mm_unpackhi_epi64(xy_lo, zw_lo);
+                let g2 = _mm_unpacklo_epi64(xy_hi, zw_hi);
+                let g3 = _mm_unpackhi_epi64(xy_hi, zw_hi);
+                // Reorders `[x0 x1 y0 y1 z0 z1 w0 w1]` into `[x0 y0 z0 w0 x1 y1 z1 w1]`.
+                let perm = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+                let (groups, []) = dest.as_chunks_mut::<8usize>() else {
+                    unreachable!()
+                };
+                for (dst, reg) in groups.iter_mut().zip([g0, g1, g2, g3]) {
+                    let reg = _mm_shuffle_epi8(reg, perm);
+                    crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(reg, dst);
+                }
+            }
+        );
+        kernel(self, a, dest);
+    }
+    /// Applies `narrow_u16x16` to each half of `a` and joins the two `u8x16` results.
+    #[inline(always)]
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
+        let (lo, hi) = self.split_u16x32(a);
+        let lo = self.narrow_u16x16(lo);
+        let hi = self.narrow_u16x16(hi);
+        self.combine_u8x16(lo, hi)
+    }
+    /// Applies `reinterpret_u8_u16x16` to each half of `a` and joins the two `u8x32` results.
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        let (lo, hi) = self.split_u16x32(a);
+        let lo = self.reinterpret_u8_u16x16(lo);
+        let hi = self.reinterpret_u8_u16x16(hi);
+        self.combine_u8x32(lo, hi)
+    }
+    /// Applies `reinterpret_u32_u16x16` to each half of `a` and joins the two `u32x8` results.
+    #[inline(always)]
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        let (lo, hi) = self.split_u16x32(a);
+        let lo = self.reinterpret_u32_u16x16(lo);
+        let hi = self.reinterpret_u32_u16x16(hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Builds the mask by using one `splat_mask16x16(val)` value for both halves.
+    #[inline(always)]
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        let one_half = self.splat_mask16x16(val);
+        self.combine_mask16x16(one_half, one_half)
+    }
+    /// Creates a mask whose storage is `val` converted with `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&val);
+        mask16x32 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the four 128-bit registers of `a` converted to `[i16; 32]` with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+        let regs: &[__m128i; 4usize] = &a.val.0;
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i16; 32usize]>(regs)
+    }
+    /// Builds the first half from `bits` and the second half from `bits >> 16`, each through
+    /// `from_bitmask_mask16x16`, and joins them.
+    #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        self.combine_mask16x16(
+            self.from_bitmask_mask16x16(bits),
+            self.from_bitmask_mask16x16(bits >> 16usize),
+        )
+    }
+    /// Packs `a` into a bitmask with one bit per lane: bits 0..16 come from the first two
+    /// registers and bits 16..32 from the last two.
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: mask16x32<Sse4_2>) -> u64 {
+                // Saturating 16-to-8-bit packing keeps each lane's sign bit for `movemask`.
+                let first_half = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
+                let second_half = _mm_packs_epi16(a.val.0[2], a.val.0[3]);
+                let low_bits = _mm_movemask_epi8(first_half) as u32 as u64;
+                let high_bits = _mm_movemask_epi8(second_half) as u32 as u64;
+                low_bits | (high_bits << 16usize)
+            }
+        );
+        kernel(self, a)
+    }
+    /// Overwrites lane `index` of `a` with all-ones bits when `value` is `true`, or zero
+    /// otherwise, by round-tripping the mask through its `[i16; 32]` array form.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is 32 or greater.
+    #[inline(always)]
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let fill = if value { !0 } else { 0 };
+        let mut lanes = self.as_array_mask16x32(*a);
+        lanes[index] = fill;
+        *a = self.load_array_mask16x32(lanes);
+    }
+    /// Runs `and_mask16x16` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_mask16x32(a);
+        let (b_lo, b_hi) = self.split_mask16x32(b);
+        let lo = self.and_mask16x16(a_lo, b_lo);
+        let hi = self.and_mask16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Runs `or_mask16x16` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_mask16x32(a);
+        let (b_lo, b_hi) = self.split_mask16x32(b);
+        let lo = self.or_mask16x16(a_lo, b_lo);
+        let hi = self.or_mask16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Runs `xor_mask16x16` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_mask16x32(a);
+        let (b_lo, b_hi) = self.split_mask16x32(b);
+        let lo = self.xor_mask16x16(a_lo, b_lo);
+        let hi = self.xor_mask16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Applies `not_mask16x16` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.not_mask16x16(lo);
+        let hi = self.not_mask16x16(hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Splits the selector `a` and the operands `b` and `c` into halves, runs
+    /// `select_mask16x16` on each set of matching halves, and joins the two results.
+    #[inline(always)]
+    fn select_mask16x32(
+        self,
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        let (sel_lo, sel_hi) = self.split_mask16x32(a);
+        let (b_lo, b_hi) = self.split_mask16x32(b);
+        let (c_lo, c_hi) = self.split_mask16x32(c);
+        let lo = self.select_mask16x16(sel_lo, b_lo, c_lo);
+        let hi = self.select_mask16x16(sel_hi, b_hi, c_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_eq_mask16x16` and joins the results.
+    #[inline(always)]
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a_lo, a_hi) = self.split_mask16x32(a);
+        let (b_lo, b_hi) = self.split_mask16x32(b);
+        let lo = self.simd_eq_mask16x16(a_lo, b_lo);
+        let hi = self.simd_eq_mask16x16(a_hi, b_hi);
+        self.combine_mask16x16(lo, hi)
+    }
+    /// Returns `true` when `any_true_mask16x16` holds for either half of `a`; the second half
+    /// is only checked if the first reports `false`.
+    #[inline(always)]
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (lo, hi) = self.split_mask16x32(a);
+        self.any_true_mask16x16(lo) || self.any_true_mask16x16(hi)
+    }
+    /// Returns `true` when `all_true_mask16x16` holds for both halves of `a`; the second half
+    /// is only checked if the first reports `true`.
+    #[inline(always)]
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (lo, hi) = self.split_mask16x32(a);
+        self.all_true_mask16x16(lo) && self.all_true_mask16x16(hi)
+    }
+    /// Returns `true` when `any_false_mask16x16` holds for either half of `a`; the second half
+    /// is only checked if the first reports `false`.
+    #[inline(always)]
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (lo, hi) = self.split_mask16x32(a);
+        self.any_false_mask16x16(lo) || self.any_false_mask16x16(hi)
+    }
+    /// Returns `true` when `all_false_mask16x16` holds for both halves of `a`; the second half
+    /// is only checked if the first reports `true`.
+    #[inline(always)]
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (lo, hi) = self.split_mask16x32(a);
+        self.all_false_mask16x16(lo) && self.all_false_mask16x16(hi)
+    }
+    /// Splits `a` into two `mask16x16` halves: storage elements 0 and 1 form the first half,
+    /// elements 2 and 3 form the second.
+    #[inline(always)]
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+        let lo = mask16x16 {
+            val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+            simd: self,
+        };
+        let hi = mask16x16 {
+            val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+            simd: self,
+        };
+        (lo, hi)
+    }
+    /// Builds the vector by using one `splat_i32x8(val)` value for both halves.
+    #[inline(always)]
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+        let one_half = self.splat_i32x8(val);
+        self.combine_i32x8(one_half, one_half)
+    }
+    /// Creates a vector whose storage is `val` converted with `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&val);
+        i32x16 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Creates a vector whose storage is the referenced array converted with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        let raw = crate::transmute::checked_transmute_copy(val);
+        i32x16 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the four 128-bit registers of `a` converted to `[i32; 16]` with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        let regs: &[__m128i; 4usize] = &a.val.0;
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i32; 16usize]>(regs)
+    }
+    /// Borrows the register storage of `a` as `&[i32; 16]` through `checked_cast_ref`.
+    #[inline(always)]
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        let regs: &[__m128i; 4usize] = &a.val.0;
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i32; 16usize]>(regs)
+    }
+    /// Mutably borrows the register storage of `a` as `&mut [i32; 16]` through
+    /// `checked_cast_mut`.
+    #[inline(always)]
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        let regs: &mut [__m128i; 4usize] = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i32; 16usize]>(regs)
+    }
+    /// Writes the register storage of `a` into `dest` with `checked_transmute_store`.
+    #[inline(always)]
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+        let regs = a.val.0;
+        crate::transmute::checked_transmute_store(regs, dest);
+    }
+    /// Wraps the storage of the byte vector `a`, converted with `checked_transmute_copy`, as an
+    /// `i32x16`.
+    #[inline(always)]
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        i32x16 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Wraps the storage of `a`, converted with `checked_transmute_copy`, as a `u8x64`.
+    #[inline(always)]
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns `b` unchanged when `SHIFT >= 16`. Otherwise passes the byte views of `b` and `a`
+    /// to `cross_block_alignr_128x4` with a byte offset of `SHIFT * 4` and converts back.
+    #[inline(always)]
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        if SHIFT >= 16usize {
+            b
+        } else {
+            let b_bytes = self.cvt_to_bytes_i32x16(b).val.0;
+            let a_bytes = self.cvt_to_bytes_i32x16(a).val.0;
+            let shifted = cross_block_alignr_128x4(self, b_bytes, a_bytes, SHIFT * 4usize);
+            self.cvt_from_bytes_i32x16(u8x64 {
+                val: crate::support::Aligned512(shifted),
+                simd: self,
+            })
+        }
+    }
+    /// Runs `slide_within_blocks_i32x8::<SHIFT>` on matching halves of `a` and `b`, then joins
+    /// the two results.
+    #[inline(always)]
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+        self,
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.slide_within_blocks_i32x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_i32x8::<SHIFT>(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `add_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.add_i32x8(a_lo, b_lo);
+        let hi = self.add_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `sub_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.sub_i32x8(a_lo, b_lo);
+        let hi = self.sub_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `mul_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.mul_i32x8(a_lo, b_lo);
+        let hi = self.mul_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `and_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.and_i32x8(a_lo, b_lo);
+        let hi = self.and_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `or_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.or_i32x8(a_lo, b_lo);
+        let hi = self.or_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `xor_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.xor_i32x8(a_lo, b_lo);
+        let hi = self.xor_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Applies `not_i32x8` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.not_i32x8(lo);
+        let hi = self.not_i32x8(hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Applies `shl_i32x8` with the same `shift` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.shl_i32x8(lo, shift);
+        let hi = self.shl_i32x8(hi, shift);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `shlv_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.shlv_i32x8(a_lo, b_lo);
+        let hi = self.shlv_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Applies `shr_i32x8` with the same `shift` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.shr_i32x8(lo, shift);
+        let hi = self.shr_i32x8(hi, shift);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `shrv_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.shrv_i32x8(a_lo, b_lo);
+        let hi = self.shrv_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_eq_i32x8` and joins the half-masks.
+    #[inline(always)]
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.simd_eq_i32x8(a_lo, b_lo);
+        let hi = self.simd_eq_i32x8(a_hi, b_hi);
+        self.combine_mask32x8(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_lt_i32x8` and joins the half-masks.
+    #[inline(always)]
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.simd_lt_i32x8(a_lo, b_lo);
+        let hi = self.simd_lt_i32x8(a_hi, b_hi);
+        self.combine_mask32x8(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_le_i32x8` and joins the half-masks.
+    #[inline(always)]
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.simd_le_i32x8(a_lo, b_lo);
+        let hi = self.simd_le_i32x8(a_hi, b_hi);
+        self.combine_mask32x8(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_ge_i32x8` and joins the half-masks.
+    #[inline(always)]
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.simd_ge_i32x8(a_lo, b_lo);
+        let hi = self.simd_ge_i32x8(a_hi, b_hi);
+        self.combine_mask32x8(lo, hi)
+    }
+    /// Compares matching halves of `a` and `b` with `simd_gt_i32x8` and joins the half-masks.
+    #[inline(always)]
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.simd_gt_i32x8(a_lo, b_lo);
+        let hi = self.simd_gt_i32x8(a_hi, b_hi);
+        self.combine_mask32x8(lo, hi)
+    }
+    /// Zips the first halves of `a` and `b`: `zip_low_i32x8` of them fills the first half of
+    /// the result and `zip_high_i32x8` of them fills the second. The second halves are unused.
+    #[inline(always)]
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, _) = self.split_i32x16(a);
+        let (b_lo, _) = self.split_i32x16(b);
+        let first = self.zip_low_i32x8(a_lo, b_lo);
+        let second = self.zip_high_i32x8(a_lo, b_lo);
+        self.combine_i32x8(first, second)
+    }
+    /// Zips the second halves of `a` and `b`: `zip_low_i32x8` of them fills the first half of
+    /// the result and `zip_high_i32x8` of them fills the second. The first halves are unused.
+    #[inline(always)]
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (_, a_hi) = self.split_i32x16(a);
+        let (_, b_hi) = self.split_i32x16(b);
+        let first = self.zip_low_i32x8(a_hi, b_hi);
+        let second = self.zip_high_i32x8(a_hi, b_hi);
+        self.combine_i32x8(first, second)
+    }
+    /// Joins `unzip_low_i32x8` of the two halves of `a` with `unzip_low_i32x8` of the two
+    /// halves of `b`.
+    #[inline(always)]
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let from_a = self.unzip_low_i32x8(a_lo, a_hi);
+        let from_b = self.unzip_low_i32x8(b_lo, b_hi);
+        self.combine_i32x8(from_a, from_b)
+    }
+    /// Joins `unzip_high_i32x8` of the two halves of `a` with `unzip_high_i32x8` of the two
+    /// halves of `b`.
+    #[inline(always)]
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let from_a = self.unzip_high_i32x8(a_lo, a_hi);
+        let from_b = self.unzip_high_i32x8(b_lo, b_hi);
+        self.combine_i32x8(from_a, from_b)
+    }
+    /// Zips the first halves of `a` and `b` (low then high) into the first output and
+    /// their second halves, in the same way, into the second output.
+    #[inline(always)]
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let first = self.combine_i32x8(
+            self.zip_low_i32x8(a_lo, b_lo),
+            self.zip_high_i32x8(a_lo, b_lo),
+        );
+        let second = self.combine_i32x8(
+            self.zip_low_i32x8(a_hi, b_hi),
+            self.zip_high_i32x8(a_hi, b_hi),
+        );
+        (first, second)
+    }
+    /// Unzips `a` and `b` separately: the first output joins the `unzip_low` results of each
+    /// input, the second output joins their `unzip_high` results.
+    #[inline(always)]
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let evens = self.combine_i32x8(
+            self.unzip_low_i32x8(a_lo, a_hi),
+            self.unzip_low_i32x8(b_lo, b_hi),
+        );
+        let odds = self.combine_i32x8(
+            self.unzip_high_i32x8(a_lo, a_hi),
+            self.unzip_high_i32x8(b_lo, b_hi),
+        );
+        (evens, odds)
+    }
+    /// Splits the mask `a` and the operands `b` and `c` into halves, runs `select_i32x8` on
+    /// each set of matching halves, and joins the two results.
+    #[inline(always)]
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+        let (mask_lo, mask_hi) = self.split_mask32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let (c_lo, c_hi) = self.split_i32x16(c);
+        let lo = self.select_i32x8(mask_lo, b_lo, c_lo);
+        let hi = self.select_i32x8(mask_hi, b_hi, c_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `min_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.min_i32x8(a_lo, b_lo);
+        let hi = self.min_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Runs `max_i32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a_lo, a_hi) = self.split_i32x16(a);
+        let (b_lo, b_hi) = self.split_i32x16(b);
+        let lo = self.max_i32x8(a_lo, b_lo);
+        let hi = self.max_i32x8(a_hi, b_hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Splits `a` into two `i32x8` halves: storage elements 0 and 1 form the first half,
+    /// elements 2 and 3 form the second.
+    #[inline(always)]
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let lo = i32x8 {
+            val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+            simd: self,
+        };
+        let hi = i32x8 {
+            val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+            simd: self,
+        };
+        (lo, hi)
+    }
+    /// Applies `neg_i32x8` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.neg_i32x8(lo);
+        let hi = self.neg_i32x8(hi);
+        self.combine_i32x8(lo, hi)
+    }
+    /// Applies `reinterpret_u8_i32x8` to each half of `a` and joins the two `u8x32` results.
+    #[inline(always)]
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.reinterpret_u8_i32x8(lo);
+        let hi = self.reinterpret_u8_i32x8(hi);
+        self.combine_u8x32(lo, hi)
+    }
+    /// Applies `reinterpret_u32_i32x8` to each half of `a` and joins the two `u32x8` results.
+    #[inline(always)]
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.reinterpret_u32_i32x8(lo);
+        let hi = self.reinterpret_u32_i32x8(hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Applies `cvt_f32_i32x8` to each half of `a` and joins the two `f32x8` results.
+    #[inline(always)]
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+        let (lo, hi) = self.split_i32x16(a);
+        let lo = self.cvt_f32_i32x8(lo);
+        let hi = self.cvt_f32_i32x8(hi);
+        self.combine_f32x8(lo, hi)
+    }
+    /// Builds the vector by using one `splat_u32x8(val)` value for both halves.
+    #[inline(always)]
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        let one_half = self.splat_u32x8(val);
+        self.combine_u32x8(one_half, one_half)
+    }
+    /// Creates a vector whose storage is `val` converted with `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&val);
+        u32x16 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Creates a vector whose storage is the referenced array converted with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        let raw = crate::transmute::checked_transmute_copy(val);
+        u32x16 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns the four 128-bit registers of `a` converted to `[u32; 16]` with
+    /// `checked_transmute_copy`.
+    #[inline(always)]
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        let regs: &[__m128i; 4usize] = &a.val.0;
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u32; 16usize]>(regs)
+    }
+    /// Borrows the register storage of `a` as `&[u32; 16]` through `checked_cast_ref`.
+    #[inline(always)]
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        let regs: &[__m128i; 4usize] = &a.val.0;
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u32; 16usize]>(regs)
+    }
+    /// Mutably borrows the register storage of `a` as `&mut [u32; 16]` through
+    /// `checked_cast_mut`.
+    #[inline(always)]
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        let regs: &mut [__m128i; 4usize] = &mut a.val.0;
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u32; 16usize]>(regs)
+    }
+    /// Writes the register storage of `a` into `dest` with `checked_transmute_store`.
+    #[inline(always)]
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        let regs = a.val.0;
+        crate::transmute::checked_transmute_store(regs, dest);
+    }
+    /// Wraps the storage of the byte vector `a`, converted with `checked_transmute_copy`, as a
+    /// `u32x16`.
+    #[inline(always)]
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u32x16 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Wraps the storage of `a`, converted with `checked_transmute_copy`, as a `u8x64`.
+    #[inline(always)]
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        let raw = crate::transmute::checked_transmute_copy(&a.val);
+        u8x64 {
+            val: raw,
+            simd: self,
+        }
+    }
+    /// Returns `b` unchanged when `SHIFT >= 16`. Otherwise passes the byte views of `b` and `a`
+    /// to `cross_block_alignr_128x4` with a byte offset of `SHIFT * 4` and converts back.
+    #[inline(always)]
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        if SHIFT >= 16usize {
+            b
+        } else {
+            let b_bytes = self.cvt_to_bytes_u32x16(b).val.0;
+            let a_bytes = self.cvt_to_bytes_u32x16(a).val.0;
+            let shifted = cross_block_alignr_128x4(self, b_bytes, a_bytes, SHIFT * 4usize);
+            self.cvt_from_bytes_u32x16(u8x64 {
+                val: crate::support::Aligned512(shifted),
+                simd: self,
+            })
+        }
+    }
+    /// Runs `slide_within_blocks_u32x8::<SHIFT>` on matching halves of `a` and `b`, then joins
+    /// the two results.
+    #[inline(always)]
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
+        self,
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.slide_within_blocks_u32x8::<SHIFT>(a_lo, b_lo);
+        let hi = self.slide_within_blocks_u32x8::<SHIFT>(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `add_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.add_u32x8(a_lo, b_lo);
+        let hi = self.add_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `sub_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.sub_u32x8(a_lo, b_lo);
+        let hi = self.sub_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `mul_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.mul_u32x8(a_lo, b_lo);
+        let hi = self.mul_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `and_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.and_u32x8(a_lo, b_lo);
+        let hi = self.and_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `or_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.or_u32x8(a_lo, b_lo);
+        let hi = self.or_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `xor_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.xor_u32x8(a_lo, b_lo);
+        let hi = self.xor_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Applies `not_u32x8` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+        let (lo, hi) = self.split_u32x16(a);
+        let lo = self.not_u32x8(lo);
+        let hi = self.not_u32x8(hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Applies `shl_u32x8` with the same `shift` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (lo, hi) = self.split_u32x16(a);
+        let lo = self.shl_u32x8(lo, shift);
+        let hi = self.shl_u32x8(hi, shift);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Runs `shlv_u32x8` on matching halves of `a` and `b`, then joins the two results.
+    #[inline(always)]
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a_lo, a_hi) = self.split_u32x16(a);
+        let (b_lo, b_hi) = self.split_u32x16(b);
+        let lo = self.shlv_u32x8(a_lo, b_lo);
+        let hi = self.shlv_u32x8(a_hi, b_hi);
+        self.combine_u32x8(lo, hi)
+    }
+    /// Applies `shr_u32x8` with the same `shift` to each half of `a` and joins the results.
+    #[inline(always)]
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (lo, hi) = self.split_u32x16(a);
+        let lo = self.shr_u32x8(lo, shift);
+        let hi = self.shr_u32x8(hi, shift);
+        self.combine_u32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, _) = self.split_u32x16(a);
+        let (b0, _) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (_, a1) = self.split_u32x16(a);
+        let (_, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_lo = self.zip_low_u32x8(a0, b0);
+        let lo_hi = self.zip_high_u32x8(a0, b0);
+        let hi_lo = self.zip_low_u32x8(a1, b1);
+        let hi_hi = self.zip_high_u32x8(a1, b1);
+        (
+            self.combine_u32x8(lo_lo, lo_hi),
+            self.combine_u32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_even = self.unzip_low_u32x8(a0, a1);
+        let lo_odd = self.unzip_high_u32x8(a0, a1);
+        let hi_even = self.unzip_low_u32x8(b0, b1);
+        let hi_odd = self.unzip_high_u32x8(b0, b1);
+        (
+            self.combine_u32x8(lo_even, hi_even),
+            self.combine_u32x8(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let (c0, c1) = self.split_u32x16(c);
+        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
         (
-            mask16x16 {
+            u32x8 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            mask16x16 {
+            u32x8 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
-        let half = self.splat_i32x8(val);
-        self.combine_i32x8(half, half)
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[u32; 16usize]) -> u32x16<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u32x8(
+                    token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
-    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u32x16<Sse4_2>, dest: &mut [u32; 16usize]) -> () {
+                let (v01, v23) = token.split_u32x16(a);
+                let (v0, v1) = token.split_u32x8(v01);
+                let (v2, v3) = token.split_u32x8(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest);
     }
     #[inline(always)]
-    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
-            val: crate::transmute::checked_transmute_copy(val),
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
+    }
+    #[inline(always)]
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
+    }
+    #[inline(always)]
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        let half = self.splat_mask32x8(val);
+        self.combine_mask32x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
+        mask32x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
         crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0)
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
     }
     #[inline(always)]
-    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i32; 16usize]>(&mut a.val.0)
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
     }
     #[inline(always)]
-    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask32x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x16(lanes);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
-        i32x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
-        u8x64 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_alignr_128x4(
-            self,
-            self.cvt_to_bytes_i32x16(b).val.0,
-            self.cvt_to_bytes_i32x16(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_i32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x16<const SHIFT: usize>(
-        self,
-        a: i32x16<Self>,
-        b: i32x16<Self>,
-    ) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(
-            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
-        )
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
     }
     #[inline(always)]
-    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
+    fn select_mask32x16(
+        self,
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        let (c0, c1) = self.split_mask32x16(c);
+        self.combine_mask32x8(
+            self.select_mask32x8(a0, b0, c0),
+            self.select_mask32x8(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
+        (
+            mask32x8 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            mask32x8 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+        let half = self.splat_f64x4(val);
+        self.combine_f64x4(half, half)
     }
     #[inline(always)]
-    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m128d; 4usize], [f64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        f64x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, _) = self.split_i32x16(a);
-        let (b0, _) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_f64x8(b).val.0,
+            self.cvt_to_bytes_f64x8(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (_, a1) = self.split_i32x16(a);
-        let (_, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+        self,
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
+        )
     }
     #[inline(always)]
-    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
     }
     #[inline(always)]
-    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
     }
     #[inline(always)]
-    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_lo = self.zip_low_i32x8(a0, b0);
-        let lo_hi = self.zip_high_i32x8(a0, b0);
-        let hi_lo = self.zip_low_i32x8(a1, b1);
-        let hi_hi = self.zip_high_i32x8(a1, b1);
-        (
-            self.combine_i32x8(lo_lo, lo_hi),
-            self.combine_i32x8(hi_lo, hi_hi),
-        )
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
-    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_even = self.unzip_low_i32x8(a0, a1);
-        let lo_odd = self.unzip_high_i32x8(a0, a1);
-        let hi_even = self.unzip_low_i32x8(b0, b1);
-        let hi_odd = self.unzip_high_i32x8(b0, b1);
-        (
-            self.combine_i32x8(lo_even, hi_even),
-            self.combine_i32x8(lo_odd, hi_odd),
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.approximate_recip_f64x4(a0),
+            self.approximate_recip_f64x4(a1),
         )
     }
     #[inline(always)]
-    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let (c0, c1) = self.split_i32x16(c);
-        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        (
-            i32x8 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            i32x8 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_i32x8(a0),
-            self.reinterpret_u32_i32x8(a1),
-        )
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
-        let half = self.splat_u32x8(val);
-        self.combine_u32x8(half, half)
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, _) = self.split_f64x8(a);
+        let (b0, _) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
     }
     #[inline(always)]
-    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u32; 16usize]>(&a.val.0)
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (_, a1) = self.split_f64x8(a);
+        let (_, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
-        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u32; 16usize]>(&a.val.0)
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
     }
     #[inline(always)]
-    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
-        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u32; 16usize]>(&mut a.val.0)
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
     }
     #[inline(always)]
-    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_lo = self.zip_low_f64x4(a0, b0);
+        let lo_hi = self.zip_high_f64x4(a0, b0);
+        let hi_lo = self.zip_low_f64x4(a1, b1);
+        let hi_hi = self.zip_high_f64x4(a1, b1);
+        (
+            self.combine_f64x4(lo_lo, lo_hi),
+            self.combine_f64x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
-        u32x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_even = self.unzip_low_f64x4(a0, a1);
+        let lo_odd = self.unzip_high_f64x4(a0, a1);
+        let hi_even = self.unzip_low_f64x4(b0, b1);
+        let hi_odd = self.unzip_high_f64x4(b0, b1);
+        (
+            self.combine_f64x4(lo_even, hi_even),
+            self.combine_f64x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        u8x64 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_alignr_128x4(
-            self,
-            self.cvt_to_bytes_u32x16(b).val.0,
-            self.cvt_to_bytes_u32x16(a).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_u32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x16<const SHIFT: usize>(
-        self,
-        a: u32x16<Self>,
-        b: u32x16<Self>,
-    ) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(
-            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.max_precise_f64x4(a0, b0),
+            self.max_precise_f64x4(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.min_precise_f64x4(a0, b0),
+            self.min_precise_f64x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_add_f64x4(a0, b0, c0),
+            self.mul_add_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_sub_f64x4(a0, b0, c0),
+            self.mul_sub_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
     }
     #[inline(always)]
-    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
     }
     #[inline(always)]
-    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.round_ties_even_f64x4(a0),
+            self.round_ties_even_f64x4(a1),
+        )
     }
     #[inline(always)]
-    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
     }
     #[inline(always)]
-    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
     }
     #[inline(always)]
-    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        (
+            f64x4 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            f64x4 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f32x8(
+            self.reinterpret_f32_f64x4(a0),
+            self.reinterpret_f32_f64x4(a1),
+        )
     }
     #[inline(always)]
-    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
+    fn splat_i64x8(self, val: i64) -> i64x8<Self> {
+        let half = self.splat_i64x4(val);
+        self.combine_i64x4(half, half)
     }
     #[inline(always)]
-    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, _) = self.split_u32x16(a);
-        let (b0, _) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [i64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (_, a1) = self.split_u32x16(a);
-        let (_, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_lo = self.zip_low_u32x8(a0, b0);
-        let lo_hi = self.zip_high_u32x8(a0, b0);
-        let hi_lo = self.zip_low_u32x8(a1, b1);
-        let hi_hi = self.zip_high_u32x8(a1, b1);
-        (
-            self.combine_u32x8(lo_lo, lo_hi),
-            self.combine_u32x8(hi_lo, hi_hi),
-        )
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_i64x8(b).val.0,
+            self.cvt_to_bytes_i64x8(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_even = self.unzip_low_u32x8(a0, a1);
-        let lo_odd = self.unzip_high_u32x8(a0, a1);
-        let hi_even = self.unzip_low_u32x8(b0, b1);
-        let hi_odd = self.unzip_high_u32x8(b0, b1);
-        (
-            self.combine_u32x8(lo_even, hi_even),
-            self.combine_u32x8(lo_odd, hi_odd),
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
+        self,
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(
+            self.slide_within_blocks_i64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let (c0, c1) = self.split_u32x16(c);
-        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        (
-            u32x8 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            u32x8 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, src: &[u32; 16usize]) -> u32x16<Sse4_2> {
-                let (chunks, []) = src.as_chunks::<4usize>() else {
-                    unreachable!()
-                };
-                let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
-                let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
-                let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
-                let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
-                let tmp0 = _mm_unpacklo_epi32(v0, v1);
-                let tmp1 = _mm_unpackhi_epi32(v0, v1);
-                let tmp2 = _mm_unpacklo_epi32(v2, v3);
-                let tmp3 = _mm_unpackhi_epi32(v2, v3);
-                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                token.combine_u32x8(
-                    token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)),
-                )
-            }
-        );
-        kernel(self, src)
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        crate::kernel!(
-            #[inline(always)]
-            fn kernel(token: Sse4_2, a: u32x16<Sse4_2>, dest: &mut [u32; 16usize]) -> () {
-                let (v01, v23) = token.split_u32x16(a);
-                let (v0, v1) = token.split_u32x8(v01);
-                let (v2, v3) = token.split_u32x8(v23);
-                let v0 = v0.into();
-                let v1 = v1.into();
-                let v2 = v2.into();
-                let v3 = v3.into();
-                let tmp0 = _mm_unpacklo_epi32(v0, v1);
-                let tmp1 = _mm_unpackhi_epi32(v0, v1);
-                let tmp2 = _mm_unpacklo_epi32(v2, v3);
-                let tmp3 = _mm_unpackhi_epi32(v2, v3);
-                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
-                    unreachable!()
-                };
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out0,
-                    &mut chunks[0],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out1,
-                    &mut chunks[1],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out2,
-                    &mut chunks[2],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
-                    out3,
-                    &mut chunks[3],
-                );
-            }
-        );
-        kernel(self, a, dest);
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1))
     }
     #[inline(always)]
-    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
-        let half = self.splat_mask32x8(val);
-        self.combine_mask32x8(half, half)
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [i32; 16usize]>(&a.val.0)
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
-        let lo = self.from_bitmask_mask32x8(bits);
-        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
-        self.combine_mask32x8(lo, hi)
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x16(a);
-        let lo = self.to_bitmask_mask32x8(lo);
-        let hi = self.to_bitmask_mask32x8(hi);
-        lo | (hi << 8usize)
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask32x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x16(lanes);
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, _) = self.split_i64x8(a);
+        let (b0, _) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0))
     }
     #[inline(always)]
-    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (_, a1) = self.split_i64x8(a);
+        let (_, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_mask32x16(
-        self,
-        a: mask32x16<Self>,
-        b: mask32x16<Self>,
-        c: mask32x16<Self>,
-    ) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        let (c0, c1) = self.split_mask32x16(c);
-        self.combine_mask32x8(
-            self.select_mask32x8(a0, b0, c0),
-            self.select_mask32x8(a1, b1, c1),
-        )
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_lo = self.zip_low_i64x4(a0, b0);
+        let lo_hi = self.zip_high_i64x4(a0, b0);
+        let hi_lo = self.zip_low_i64x4(a1, b1);
+        let hi_hi = self.zip_high_i64x4(a1, b1);
+        (
+            self.combine_i64x4(lo_lo, lo_hi),
+            self.combine_i64x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_even = self.unzip_low_i64x4(a0, a1);
+        let lo_odd = self.unzip_high_i64x4(a0, a1);
+        let hi_even = self.unzip_low_i64x4(b0, b1);
+        let hi_odd = self.unzip_high_i64x4(b0, b1);
+        (
+            self.combine_i64x4(lo_even, hi_even),
+            self.combine_i64x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let (c0, c1) = self.split_i64x8(c);
+        self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>) {
         (
-            mask32x8 {
+            i64x4 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            mask32x8 {
+            i64x4 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
-        let half = self.splat_f64x4(val);
-        self.combine_f64x4(half, half)
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1))
     }
     #[inline(always)]
-    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i64x4(a0),
+            self.reinterpret_u32_i64x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u64x8(self, val: u64) -> u64x8<Self> {
+        let half = self.splat_u64x4(val);
+        self.combine_u64x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
-        crate::transmute::checked_transmute_copy::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0)
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[__m128i; 4usize], [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
-        crate::transmute::checked_cast_ref::<[__m128d; 4usize], [f64; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize] {
+        crate::transmute::checked_cast_ref::<[__m128i; 4usize], [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
-        crate::transmute::checked_cast_mut::<[__m128d; 4usize], [f64; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize] {
+        crate::transmute::checked_cast_mut::<[__m128i; 4usize], [u64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_alignr_128x4(
             self,
-            self.cvt_to_bytes_f64x8(b).val.0,
-            self.cvt_to_bytes_f64x8(a).val.0,
+            self.cvt_to_bytes_u64x8(b).val.0,
+            self.cvt_to_bytes_u64x8(a).val.0,
             SHIFT * 8usize,
         );
-        self.cvt_from_bytes_f64x8(u8x64 {
+        self.cvt_from_bytes_u64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
         self,
-        a: f64x8<Self>,
-        b: f64x8<Self>,
-    ) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
-    }
-    #[inline(always)]
-    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
-    }
-    #[inline(always)]
-    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
-    }
-    #[inline(always)]
-    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.approximate_recip_f64x4(a0),
-            self.approximate_recip_f64x4(a1),
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(
+            self.slide_within_blocks_u64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1))
     }
     #[inline(always)]
-    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, _) = self.split_f64x8(a);
-        let (b0, _) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (_, a1) = self.split_f64x8(a);
-        let (_, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_lo = self.zip_low_f64x4(a0, b0);
-        let lo_hi = self.zip_high_f64x4(a0, b0);
-        let hi_lo = self.zip_low_f64x4(a1, b1);
-        let hi_hi = self.zip_high_f64x4(a1, b1);
-        (
-            self.combine_f64x4(lo_lo, lo_hi),
-            self.combine_f64x4(hi_lo, hi_hi),
-        )
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_even = self.unzip_low_f64x4(a0, a1);
-        let lo_odd = self.unzip_high_f64x4(a0, a1);
-        let hi_even = self.unzip_low_f64x4(b0, b1);
-        let hi_odd = self.unzip_high_f64x4(b0, b1);
-        (
-            self.combine_f64x4(lo_even, hi_even),
-            self.combine_f64x4(lo_odd, hi_odd),
-        )
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, _) = self.split_u64x8(a);
+        let (b0, _) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0))
     }
     #[inline(always)]
-    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.max_precise_f64x4(a0, b0),
-            self.max_precise_f64x4(a1, b1),
-        )
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (_, a1) = self.split_u64x8(a);
+        let (_, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.min_precise_f64x4(a0, b0),
-            self.min_precise_f64x4(a1, b1),
-        )
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_add_f64x4(a0, b0, c0),
-            self.mul_add_f64x4(a1, b1, c1),
-        )
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_sub_f64x4(a0, b0, c0),
-            self.mul_sub_f64x4(a1, b1, c1),
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_lo = self.zip_low_u64x4(a0, b0);
+        let lo_hi = self.zip_high_u64x4(a0, b0);
+        let hi_lo = self.zip_low_u64x4(a1, b1);
+        let hi_hi = self.zip_high_u64x4(a1, b1);
+        (
+            self.combine_u64x4(lo_lo, lo_hi),
+            self.combine_u64x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
-    }
-    #[inline(always)]
-    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
-    }
-    #[inline(always)]
-    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.round_ties_even_f64x4(a0),
-            self.round_ties_even_f64x4(a1),
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_even = self.unzip_low_u64x4(a0, a1);
+        let lo_odd = self.unzip_high_u64x4(a0, a1);
+        let hi_even = self.unzip_low_u64x4(b0, b1);
+        let hi_odd = self.unzip_high_u64x4(b0, b1);
+        (
+            self.combine_u64x4(lo_even, hi_even),
+            self.combine_u64x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let (c0, c1) = self.split_u64x8(c);
+        self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_mask64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>) {
         (
-            f64x4 {
+            u64x4 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            f64x4 {
+            u64x4 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f32x8(
-            self.reinterpret_f32_f64x4(a0),
-            self.reinterpret_f32_f64x4(a1),
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[u64; 8usize]) -> u64x8<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<2usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]);
+                let out0 = _mm_unpacklo_epi64(v0, v1);
+                let out1 = _mm_unpacklo_epi64(v2, v3);
+                let out2 = _mm_unpackhi_epi64(v0, v1);
+                let out3 = _mm_unpackhi_epi64(v2, v3);
+                token.combine_u64x4(
+                    token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u64x8<Sse4_2>, dest: &mut [u64; 8usize]) -> () {
+                let (v01, v23) = token.split_u64x8(a);
+                let (v0, v1) = token.split_u64x4(v01);
+                let (v2, v3) = token.split_u64x4(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let out0 = _mm_unpacklo_epi64(v0, v2);
+                let out1 = _mm_unpackhi_epi64(v0, v2);
+                let out2 = _mm_unpacklo_epi64(v1, v3);
+                let out3 = _mm_unpackhi_epi64(v1, v3);
+                let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest);
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u64x4(a0),
+            self.reinterpret_u32_u64x4(a1),
         )
     }
     #[inline(always)]
@@ -10010,6 +12142,36 @@ impl<S: Simd> From<f64x2<S>> for __m128d {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<__m128i, S> for i64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x2<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: i64x2<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<__m128i, S> for u64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: __m128i) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x2<S>> for __m128i {
+    #[inline(always)]
+    fn from(value: u64x2<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
 impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: __m128i) -> Self {
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 09a2c0048..2c66ee1e1 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -6,9 +6,9 @@
 use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
-    i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
-    mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
-    u32x4, u32x8, u32x16,
+    i32x8, i32x16, i64x2, i64x4, i64x8, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16,
+    mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
+    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8,
 };
 use core::arch::wasm32::*;
 #[doc = "A token for WASM SIMD128, representing the \"wasm128\" level."]
@@ -35,6 +35,8 @@ impl ArchTypes for WasmSimd128 {
     type u32x4 = crate::support::Aligned128<v128>;
     type mask32x4 = crate::support::Aligned128<v128>;
     type f64x2 = crate::support::Aligned128<v128>;
+    type i64x2 = crate::support::Aligned128<v128>;
+    type u64x2 = crate::support::Aligned128<v128>;
     type mask64x2 = crate::support::Aligned128<v128>;
     type f32x8 = crate::support::Aligned256<[v128; 2usize]>;
     type i8x32 = crate::support::Aligned256<[v128; 2usize]>;
@@ -47,6 +49,8 @@ impl ArchTypes for WasmSimd128 {
     type u32x8 = crate::support::Aligned256<[v128; 2usize]>;
     type mask32x8 = crate::support::Aligned256<[v128; 2usize]>;
     type f64x4 = crate::support::Aligned256<[v128; 2usize]>;
+    type i64x4 = crate::support::Aligned256<[v128; 2usize]>;
+    type u64x4 = crate::support::Aligned256<[v128; 2usize]>;
     type mask64x4 = crate::support::Aligned256<[v128; 2usize]>;
     type f32x16 = crate::support::Aligned512<[v128; 4usize]>;
     type i8x64 = crate::support::Aligned512<[v128; 4usize]>;
@@ -59,6 +63,8 @@ impl ArchTypes for WasmSimd128 {
     type u32x16 = crate::support::Aligned512<[v128; 4usize]>;
     type mask32x16 = crate::support::Aligned512<[v128; 4usize]>;
     type f64x8 = crate::support::Aligned512<[v128; 4usize]>;
+    type i64x8 = crate::support::Aligned512<[v128; 4usize]>;
+    type u64x8 = crate::support::Aligned512<[v128; 4usize]>;
     type mask64x8 = crate::support::Aligned512<[v128; 4usize]>;
 }
 impl Simd for WasmSimd128 {
@@ -70,6 +76,8 @@ impl Simd for WasmSimd128 {
     type i16s = i16x8<Self>;
     type u32s = u32x4<Self>;
     type i32s = i32x4<Self>;
+    type u64s = u64x2<Self>;
+    type i64s = i64x2<Self>;
     type mask8s = mask8x16<Self>;
     type mask16s = mask16x8<Self>;
     type mask32s = mask32x4<Self>;
@@ -487,7 +495,27 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i8; 16usize] = a.into();
+        let b: [i8; 16usize] = b.into();
+        let result: [i8; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
@@ -495,7 +523,27 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i8; 16usize] = a.into();
+        let b: [i8; 16usize] = b.into();
+        let result: [i8; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
@@ -700,7 +748,27 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u8; 16usize] = a.into();
+        let b: [u8; 16usize] = b.into();
+        let result: [u8; 16usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+            core::ops::Shl::shl(a[8usize], b[8usize]),
+            core::ops::Shl::shl(a[9usize], b[9usize]),
+            core::ops::Shl::shl(a[10usize], b[10usize]),
+            core::ops::Shl::shl(a[11usize], b[11usize]),
+            core::ops::Shl::shl(a[12usize], b[12usize]),
+            core::ops::Shl::shl(a[13usize], b[13usize]),
+            core::ops::Shl::shl(a[14usize], b[14usize]),
+            core::ops::Shl::shl(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
@@ -708,7 +776,27 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u8; 16usize] = a.into();
+        let b: [u8; 16usize] = b.into();
+        let result: [u8; 16usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+            core::ops::Shr::shr(a[8usize], b[8usize]),
+            core::ops::Shr::shr(a[9usize], b[9usize]),
+            core::ops::Shr::shr(a[10usize], b[10usize]),
+            core::ops::Shr::shr(a[11usize], b[11usize]),
+            core::ops::Shr::shr(a[12usize], b[12usize]),
+            core::ops::Shr::shr(a[13usize], b[13usize]),
+            core::ops::Shr::shr(a[14usize], b[14usize]),
+            core::ops::Shr::shr(a[15usize], b[15usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
@@ -1007,7 +1095,19 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i16; 8usize] = a.into();
+        let b: [i16; 8usize] = b.into();
+        let result: [i16; 8usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
@@ -1015,7 +1115,19 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i16; 8usize] = a.into();
+        let b: [i16; 8usize] = b.into();
+        let result: [i16; 8usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
@@ -1204,7 +1316,19 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u16; 8usize] = a.into();
+        let b: [u16; 8usize] = b.into();
+        let result: [u16; 8usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+            core::ops::Shl::shl(a[4usize], b[4usize]),
+            core::ops::Shl::shl(a[5usize], b[5usize]),
+            core::ops::Shl::shl(a[6usize], b[6usize]),
+            core::ops::Shl::shl(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
@@ -1212,7 +1336,19 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u16; 8usize] = a.into();
+        let b: [u16; 8usize] = b.into();
+        let result: [u16; 8usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+            core::ops::Shr::shr(a[4usize], b[4usize]),
+            core::ops::Shr::shr(a[5usize], b[5usize]),
+            core::ops::Shr::shr(a[6usize], b[6usize]),
+            core::ops::Shr::shr(a[7usize], b[7usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
@@ -1494,7 +1630,15 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [i32; 4usize] = a.into();
+        let b: [i32; 4usize] = b.into();
+        let result: [i32; 4usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
@@ -1502,7 +1646,15 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [i32; 4usize] = a.into();
+        let b: [i32; 4usize] = b.into();
+        let result: [i32; 4usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
@@ -1695,7 +1847,15 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
+        let a: [u32; 4usize] = a.into();
+        let b: [u32; 4usize] = b.into();
+        let result: [u32; 4usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+            core::ops::Shl::shl(a[2usize], b[2usize]),
+            core::ops::Shl::shl(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
@@ -1703,7 +1863,15 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
-        core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
+        let a: [u32; 4usize] = a.into();
+        let b: [u32; 4usize] = b.into();
+        let result: [u32; 4usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+            core::ops::Shr::shr(a[2usize], b[2usize]),
+            core::ops::Shr::shr(a[3usize], b[3usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
     fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
@@ -2133,66 +2301,178 @@ impl Simd for WasmSimd128 {
         <v128>::from(a).simd_into(self)
     }
     #[inline(always)]
-    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
-        let val: i64 = if val { !0 } else { 0 };
+    fn splat_i64x2(self, val: i64) -> i64x2<Self> {
         i64x2_splat(val).simd_into(self)
     }
     #[inline(always)]
-    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
-        mask64x2 {
+    fn load_array_i64x2(self, val: [i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+    fn load_array_ref_i64x2(self, val: &[i64; 2usize]) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i64x2(self, a: i64x2<Self>) -> [i64; 2usize] {
         crate::transmute::checked_transmute_copy::<v128, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
-        let bitset = i64x2_splat(bits as i64);
-        let powers = u64x2(1, 2);
-        let selected = v128_and(bitset, powers);
-        i64x2_ne(selected, i64x2_splat(0)).simd_into(self)
+    fn as_array_ref_i64x2(self, a: &i64x2<Self>) -> &[i64; 2usize] {
+        crate::transmute::checked_cast_ref::<v128, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
-        i64x2_bitmask(a.into()) as u64
+    fn as_array_mut_i64x2(self, a: &mut i64x2<Self>) -> &mut [i64; 2usize] {
+        crate::transmute::checked_cast_mut::<v128, [i64; 2usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 2usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            2usize
+    fn store_array_i64x2(self, a: i64x2<Self>, dest: &mut [i64; 2usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i64x2(self, a: u8x16<Self>) -> i64x2<Self> {
+        i64x2 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        u8x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x2<const SHIFT: usize>(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        if SHIFT >= 2usize {
+            return b;
+        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_i64x2(a).val.0,
+            self.cvt_to_bytes_i64x2(b).val.0,
+            SHIFT * 8usize,
         );
-        let mut lanes = self.as_array_mask64x2(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x2(lanes);
+        self.cvt_from_bytes_i64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn slide_within_blocks_i64x2<const SHIFT: usize>(
+        self,
+        a: i64x2<Self>,
+        b: i64x2<Self>,
+    ) -> i64x2<Self> {
+        self.slide_i64x2::<SHIFT>(a, b)
+    }
+    #[inline(always)]
+    fn add_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        i64x2_add(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn sub_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        i64x2_sub(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn mul_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        i64x2_mul(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn and_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn or_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         v128_or(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+    fn xor_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
         v128_xor(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+    fn not_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
         v128_not(a.into()).simd_into(self)
     }
     #[inline(always)]
-    fn select_mask64x2(
-        self,
-        a: mask64x2<Self>,
-        b: mask64x2<Self>,
-        c: mask64x2<Self>,
-    ) -> mask64x2<Self> {
+    fn shl_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
+        i64x2_shl(a.into(), shift).simd_into(self)
+    }
+    #[inline(always)]
+    fn shlv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn shr_i64x2(self, a: i64x2<Self>, shift: u32) -> i64x2<Self> {
+        i64x2_shr(a.into(), shift).simd_into(self)
+    }
+    #[inline(always)]
+    fn shrv_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+        ];
+        result.simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_eq_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        i64x2_eq(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_lt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        i64x2_lt(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_le_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        i64x2_le(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_ge_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        i64x2_ge(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn simd_gt_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> mask64x2<Self> {
+        i64x2_gt(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn zip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn zip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn unzip_low_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn unzip_high_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self)
+    }
+    #[inline(always)]
+    fn interleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.zip_low_i64x2(a, b), self.zip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn deinterleave_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (self.unzip_low_i64x2(a, b), self.unzip_high_i64x2(a, b))
+    }
+    #[inline(always)]
+    fn select_i64x2(self, a: mask64x2<Self>, b: i64x2<Self>, c: i64x2<Self>) -> i64x2<Self> {
         #[cfg(target_feature = "relaxed-simd")]
         {
             i64x2_relaxed_laneselect(b.into(), c.into(), a.into()).simd_into(self)
@@ -2203,3166 +2483,4109 @@ impl Simd for WasmSimd128 {
         }
     }
     #[inline(always)]
-    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
-        i64x2_eq(a.into(), b.into()).simd_into(self)
+    fn min_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
-        v128_any_true(a.into())
+    fn max_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x2<Self> {
+        let a: [i64; 2usize] = a.into();
+        let b: [i64; 2usize] = b.into();
+        let result: [i64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
-        i64x2_all_true(a.into())
+    fn combine_i64x2(self, a: i64x2<Self>, b: i64x2<Self>) -> i64x4<Self> {
+        i64x4 {
+            val: crate::support::Aligned256([a.val.0, b.val.0]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
-        !i64x2_all_true(a.into())
+    fn neg_i64x2(self, a: i64x2<Self>) -> i64x2<Self> {
+        i64x2_neg(a.into()).simd_into(self)
     }
     #[inline(always)]
-    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
-        !v128_any_true(a.into())
+    fn reinterpret_u8_i64x2(self, a: i64x2<Self>) -> u8x16<Self> {
+        <v128>::from(a).simd_into(self)
     }
     #[inline(always)]
-    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
-        mask64x4 {
-            val: crate::support::Aligned256([a.val.0, b.val.0]),
-            simd: self,
-        }
+    fn reinterpret_u32_i64x2(self, a: i64x2<Self>) -> u32x4<Self> {
+        <v128>::from(a).simd_into(self)
     }
     #[inline(always)]
-    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
-        let half = self.splat_f32x4(val);
-        self.combine_f32x4(half, half)
+    fn splat_u64x2(self, val: u64) -> u64x2<Self> {
+        u64x2_splat(val).simd_into(self)
     }
     #[inline(always)]
-    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
+    fn load_array_u64x2(self, val: [u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
-        f32x8 {
+    fn load_array_ref_u64x2(self, val: &[u64; 2usize]) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [f32; 8usize]>(&a.val.0)
+    fn as_array_u64x2(self, a: u64x2<Self>) -> [u64; 2usize] {
+        crate::transmute::checked_transmute_copy::<v128, [u64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [f32; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x2(self, a: &u64x2<Self>) -> &[u64; 2usize] {
+        crate::transmute::checked_cast_ref::<v128, [u64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [f32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x2(self, a: &mut u64x2<Self>) -> &mut [u64; 2usize] {
+        crate::transmute::checked_cast_mut::<v128, [u64; 2usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
+    fn store_array_u64x2(self, a: u64x2<Self>, dest: &mut [u64; 2usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
-        f32x8 {
+    fn cvt_from_bytes_u64x2(self, a: u8x16<Self>) -> u64x2<Self> {
+        u64x2 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_to_bytes_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        u8x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        if SHIFT >= 8usize {
+    fn slide_u64x2<const SHIFT: usize>(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        if SHIFT >= 2usize {
             return b;
         }
-        let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_f32x8(a).val.0,
-            self.cvt_to_bytes_f32x8(b).val.0,
-            SHIFT * 4usize,
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_u64x2(a).val.0,
+            self.cvt_to_bytes_u64x2(b).val.0,
+            SHIFT * 8usize,
         );
-        self.cvt_from_bytes_f32x8(u8x32 {
-            val: crate::support::Aligned256(result),
+        self.cvt_from_bytes_u64x2(u8x16 {
+            val: crate::support::Aligned128(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x2<const SHIFT: usize>(
         self,
-        a: f32x8<Self>,
-        b: f32x8<Self>,
-    ) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
-        )
+        a: u64x2<Self>,
+        b: u64x2<Self>,
+    ) -> u64x2<Self> {
+        self.slide_u64x2::<SHIFT>(a, b)
     }
     #[inline(always)]
-    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
+    fn add_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_add(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
+    fn sub_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_sub(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
+    fn mul_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_mul(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(
-            self.approximate_recip_f32x4(a0),
-            self.approximate_recip_f32x4(a1),
-        )
+    fn and_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        v128_and(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
+    fn or_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        v128_or(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
+    fn xor_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        v128_xor(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
+    fn not_u64x2(self, a: u64x2<Self>) -> u64x2<Self> {
+        v128_not(a.into()).simd_into(self)
     }
     #[inline(always)]
-    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
+    fn shl_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        u64x2_shl(a.into(), shift).simd_into(self)
     }
     #[inline(always)]
-    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
+    fn shlv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [
+            core::ops::Shl::shl(a[0usize], b[0usize]),
+            core::ops::Shl::shl(a[1usize], b[1usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
+    fn shr_u64x2(self, a: u64x2<Self>, shift: u32) -> u64x2<Self> {
+        u64x2_shr(a.into(), shift).simd_into(self)
     }
     #[inline(always)]
-    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
+    fn shrv_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [
+            core::ops::Shr::shr(a[0usize], b[0usize]),
+            core::ops::Shr::shr(a[1usize], b[1usize]),
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
+    fn simd_eq_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let true_lane: i64 = !0;
+        let false_lane: i64 = 0;
+        let result: [i64; 2usize] = [
+            if a[0usize] == b[0usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+            if a[1usize] == b[1usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
+    fn simd_lt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let true_lane: i64 = !0;
+        let false_lane: i64 = 0;
+        let result: [i64; 2usize] = [
+            if a[0usize] < b[0usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+            if a[1usize] < b[1usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
+    fn simd_le_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let true_lane: i64 = !0;
+        let false_lane: i64 = 0;
+        let result: [i64; 2usize] = [
+            if a[0usize] <= b[0usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+            if a[1usize] <= b[1usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, _) = self.split_f32x8(a);
-        let (b0, _) = self.split_f32x8(b);
-        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
+    fn simd_ge_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let true_lane: i64 = !0;
+        let false_lane: i64 = 0;
+        let result: [i64; 2usize] = [
+            if a[0usize] >= b[0usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+            if a[1usize] >= b[1usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (_, a1) = self.split_f32x8(a);
-        let (_, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
+    fn simd_gt_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> mask64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let true_lane: i64 = !0;
+        let false_lane: i64 = 0;
+        let result: [i64; 2usize] = [
+            if a[0usize] > b[0usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+            if a[1usize] > b[1usize] {
+                true_lane
+            } else {
+                false_lane
+            },
+        ];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
+    fn zip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
+    fn zip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let lo_lo = self.zip_low_f32x4(a0, b0);
-        let lo_hi = self.zip_high_f32x4(a0, b0);
-        let hi_lo = self.zip_low_f32x4(a1, b1);
-        let hi_hi = self.zip_high_f32x4(a1, b1);
-        (
-            self.combine_f32x4(lo_lo, lo_hi),
-            self.combine_f32x4(hi_lo, hi_hi),
-        )
+    fn unzip_low_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let lo_even = self.unzip_low_f32x4(a0, a1);
-        let lo_odd = self.unzip_high_f32x4(a0, a1);
-        let hi_even = self.unzip_low_f32x4(b0, b1);
-        let hi_odd = self.unzip_high_f32x4(b0, b1);
-        (
-            self.combine_f32x4(lo_even, hi_even),
-            self.combine_f32x4(lo_odd, hi_odd),
-        )
+    fn unzip_high_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
+    fn interleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.zip_low_u64x2(a, b), self.zip_high_u64x2(a, b))
     }
     #[inline(always)]
-    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
+    fn deinterleave_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (self.unzip_low_u64x2(a, b), self.unzip_high_u64x2(a, b))
     }
     #[inline(always)]
-    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.max_precise_f32x4(a0, b0),
-            self.max_precise_f32x4(a1, b1),
-        )
+    fn select_u64x2(self, a: mask64x2<Self>, b: u64x2<Self>, c: u64x2<Self>) -> u64x2<Self> {
+        #[cfg(target_feature = "relaxed-simd")]
+        {
+            i64x2_relaxed_laneselect(b.into(), c.into(), a.into()).simd_into(self)
+        }
+        #[cfg(not(target_feature = "relaxed-simd"))]
+        {
+            v128_bitselect(b.into(), c.into(), a.into()).simd_into(self)
+        }
     }
     #[inline(always)]
-    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        self.combine_f32x4(
-            self.min_precise_f32x4(a0, b0),
-            self.min_precise_f32x4(a1, b1),
-        )
+    fn min_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [a[0usize].min(b[0usize]), a[1usize].min(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(
-            self.mul_add_f32x4(a0, b0, c0),
-            self.mul_add_f32x4(a1, b1, c1),
-        )
+    fn max_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x2<Self> {
+        let a: [u64; 2usize] = a.into();
+        let b: [u64; 2usize] = b.into();
+        let result: [u64; 2usize] = [a[0usize].max(b[0usize]), a[1usize].max(b[1usize])];
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(
-            self.mul_sub_f32x4(a0, b0, c0),
-            self.mul_sub_f32x4(a1, b1, c1),
-        )
+    fn combine_u64x2(self, a: u64x2<Self>, b: u64x2<Self>) -> u64x4<Self> {
+        u64x4 {
+            val: crate::support::Aligned256([a.val.0, b.val.0]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
+    fn reinterpret_u8_u64x2(self, a: u64x2<Self>) -> u8x16<Self> {
+        <v128>::from(a).simd_into(self)
     }
     #[inline(always)]
-    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
+    fn reinterpret_u32_u64x2(self, a: u64x2<Self>) -> u32x4<Self> {
+        <v128>::from(a).simd_into(self)
     }
     #[inline(always)]
-    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(
-            self.round_ties_even_f32x4(a0),
-            self.round_ties_even_f32x4(a1),
-        )
+    fn splat_mask64x2(self, val: bool) -> mask64x2<Self> {
+        let val: i64 = if val { !0 } else { 0 };
+        i64x2_splat(val).simd_into(self)
     }
     #[inline(always)]
-    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
+    fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
+        mask64x2 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
+    fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
+        crate::transmute::checked_transmute_copy::<v128, [i64; 2usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_f32x8(b);
-        let (c0, c1) = self.split_f32x8(c);
-        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        let bitset = i64x2_splat(bits as i64);
+        let powers = u64x2(1, 2);
+        let selected = v128_and(bitset, powers);
+        i64x2_ne(selected, i64x2_splat(0)).simd_into(self)
     }
     #[inline(always)]
-    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
-        f32x16 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        i64x2_bitmask(a.into()) as u64
     }
     #[inline(always)]
-    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
-        (
-            f32x4 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            f32x4 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn set_mask64x2(self, a: &mut mask64x2<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 2usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2usize
+        );
+        let mut lanes = self.as_array_mask64x2(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x2(lanes);
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_f64x2(
-            self.reinterpret_f64_f32x4(a0),
-            self.reinterpret_f64_f32x4(a1),
-        )
+    fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        v128_and(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(
-            self.reinterpret_i32_f32x4(a0),
-            self.reinterpret_i32_f32x4(a1),
-        )
+    fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        v128_or(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
+    fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        v128_xor(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_f32x4(a0),
-            self.reinterpret_u32_f32x4(a1),
-        )
+    fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
+        v128_not(a.into()).simd_into(self)
     }
     #[inline(always)]
-    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
+    fn select_mask64x2(
+        self,
+        a: mask64x2<Self>,
+        b: mask64x2<Self>,
+        c: mask64x2<Self>,
+    ) -> mask64x2<Self> {
+        #[cfg(target_feature = "relaxed-simd")]
+        {
+            i64x2_relaxed_laneselect(b.into(), c.into(), a.into()).simd_into(self)
+        }
+        #[cfg(not(target_feature = "relaxed-simd"))]
+        {
+            v128_bitselect(b.into(), c.into(), a.into()).simd_into(self)
+        }
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_u32x4(
-            self.cvt_u32_precise_f32x4(a0),
-            self.cvt_u32_precise_f32x4(a1),
-        )
+    fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
+        i64x2_eq(a.into(), b.into()).simd_into(self)
     }
     #[inline(always)]
-    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
+    fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        v128_any_true(a.into())
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_f32x8(a);
-        self.combine_i32x4(
-            self.cvt_i32_precise_f32x4(a0),
-            self.cvt_i32_precise_f32x4(a1),
-        )
+    fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
+        i64x2_all_true(a.into())
     }
     #[inline(always)]
-    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
-        let half = self.splat_i8x16(val);
-        self.combine_i8x16(half, half)
+    fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        !i64x2_all_true(a.into())
     }
     #[inline(always)]
-    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
+        !v128_any_true(a.into())
     }
     #[inline(always)]
-    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
-        i8x32 {
-            val: crate::transmute::checked_transmute_copy(val),
+    fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::support::Aligned256([a.val.0, b.val.0]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0)
+    fn splat_f32x8(self, val: f32) -> f32x8<Self> {
+        let half = self.splat_f32x4(val);
+        self.combine_f32x4(half, half)
     }
     #[inline(always)]
-    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [i8; 32usize]>(&a.val.0)
+    fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [i8; 32usize]>(&mut a.val.0)
+    fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
+        f32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
+    fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [f32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [f32; 8usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [f32; 8usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
-        i8x32 {
+    fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
+        f32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_i8x32(a).val.0,
-            self.cvt_to_bytes_i8x32(b).val.0,
-            SHIFT,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_i8x32(u8x32 {
+        self.cvt_from_bytes_f32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x32<const SHIFT: usize>(
+    fn slide_within_blocks_f32x8<const SHIFT: usize>(
         self,
-        a: i8x32<Self>,
-        b: i8x32<Self>,
-    ) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(
-            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
+        a: f32x8<Self>,
+        b: f32x8<Self>,
+    ) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
-    }
-    #[inline(always)]
-    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
+    fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
     }
     #[inline(always)]
-    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
+    fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
     }
     #[inline(always)]
-    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
+    fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
     }
     #[inline(always)]
-    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
+    fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(
+            self.approximate_recip_f32x4(a0),
+            self.approximate_recip_f32x4(a1),
+        )
     }
     #[inline(always)]
-    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
+    fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
+    fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
+    fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
+    fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
+    fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
+    fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
+    fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
+    fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
+    fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
+    fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, _) = self.split_i8x32(a);
-        let (b0, _) = self.split_i8x32(b);
-        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
+    fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, _) = self.split_f32x8(a);
+        let (b0, _) = self.split_f32x8(b);
+        self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (_, a1) = self.split_i8x32(a);
-        let (_, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
+    fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (_, a1) = self.split_f32x8(a);
+        let (_, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
+    fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
+    fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let lo_lo = self.zip_low_i8x16(a0, b0);
-        let lo_hi = self.zip_high_i8x16(a0, b0);
-        let hi_lo = self.zip_low_i8x16(a1, b1);
-        let hi_hi = self.zip_high_i8x16(a1, b1);
+    fn interleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let lo_lo = self.zip_low_f32x4(a0, b0);
+        let lo_hi = self.zip_high_f32x4(a0, b0);
+        let hi_lo = self.zip_low_f32x4(a1, b1);
+        let hi_hi = self.zip_high_f32x4(a1, b1);
         (
-            self.combine_i8x16(lo_lo, lo_hi),
-            self.combine_i8x16(hi_lo, hi_hi),
+            self.combine_f32x4(lo_lo, lo_hi),
+            self.combine_f32x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let lo_even = self.unzip_low_i8x16(a0, a1);
-        let lo_odd = self.unzip_high_i8x16(a0, a1);
-        let hi_even = self.unzip_low_i8x16(b0, b1);
-        let hi_odd = self.unzip_high_i8x16(b0, b1);
+    fn deinterleave_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let lo_even = self.unzip_low_f32x4(a0, a1);
+        let lo_odd = self.unzip_high_f32x4(a0, a1);
+        let hi_even = self.unzip_low_f32x4(b0, b1);
+        let hi_odd = self.unzip_high_f32x4(b0, b1);
         (
-            self.combine_i8x16(lo_even, hi_even),
-            self.combine_i8x16(lo_odd, hi_odd),
+            self.combine_f32x4(lo_even, hi_even),
+            self.combine_f32x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        let (c0, c1) = self.split_i8x32(c);
-        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
+    fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
+    fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        let (b0, b1) = self.split_i8x32(b);
-        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
+    fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.max_precise_f32x4(a0, b0),
+            self.max_precise_f32x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
-        i8x64 {
+    fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        self.combine_f32x4(
+            self.min_precise_f32x4(a0, b0),
+            self.min_precise_f32x4(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let (c0, c1) = self.split_f32x8(c);
+        self.combine_f32x4(
+            self.mul_add_f32x4(a0, b0, c0),
+            self.mul_add_f32x4(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let (c0, c1) = self.split_f32x8(c);
+        self.combine_f32x4(
+            self.mul_sub_f32x4(a0, b0, c0),
+            self.mul_sub_f32x4(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
+    }
+    #[inline(always)]
+    fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(
+            self.round_ties_even_f32x4(a0),
+            self.round_ties_even_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
+    }
+    #[inline(always)]
+    fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
+    }
+    #[inline(always)]
+    fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_f32x8(b);
+        let (c0, c1) = self.split_f32x8(c);
+        self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
+        f32x16 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
+    fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
         (
-            i8x16 {
+            f32x4 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            i8x16 {
+            f32x4 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
+    fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f64x2(
+            self.reinterpret_f64_f32x4(a0),
+            self.reinterpret_f64_f32x4(a1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i8x32(a);
-        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
+    fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_i32x4(
+            self.reinterpret_i32_f32x4(a0),
+            self.reinterpret_i32_f32x4(a1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i8x32(a);
+    fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
         self.combine_u32x4(
-            self.reinterpret_u32_i8x16(a0),
-            self.reinterpret_u32_i8x16(a1),
+            self.reinterpret_u32_f32x4(a0),
+            self.reinterpret_u32_f32x4(a1),
         )
     }
     #[inline(always)]
-    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
-        let half = self.splat_u8x16(val);
-        self.combine_u8x16(half, half)
+    fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
     }
     #[inline(always)]
-    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_u32x4(
+            self.cvt_u32_precise_f32x4(a0),
+            self.cvt_u32_precise_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_i32x4(
+            self.cvt_i32_precise_f32x4(a0),
+            self.cvt_i32_precise_f32x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_i8x32(self, val: i8) -> i8x32<Self> {
+        let half = self.splat_i8x16(val);
+        self.combine_i8x16(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
-        u8x32 {
+    fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u8; 32usize]>(&a.val.0)
+    fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [u8; 32usize]>(&a.val.0)
+    fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [u8; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [i8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+    fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        u8x32 {
+    fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
+        i8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+    fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
         if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_u8x32(a).val.0,
-            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
+            self.cvt_to_bytes_i8x32(b).val.0,
             SHIFT,
         );
-        self.cvt_from_bytes_u8x32(u8x32 {
+        self.cvt_from_bytes_i8x32(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x32<const SHIFT: usize>(
+    fn slide_within_blocks_i8x32<const SHIFT: usize>(
         self,
-        a: u8x32<Self>,
-        b: u8x32<Self>,
-    ) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(
-            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
+        a: i8x32<Self>,
+        b: i8x32<Self>,
+    ) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(
+            self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
+    fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
+    fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
+    fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
+    fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
+    fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
+    fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
+    fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
     }
     #[inline(always)]
-    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
+    fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
+    fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
+    fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
+    fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
+    fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
+    fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
+    fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
+    fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
+    fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, _) = self.split_u8x32(a);
-        let (b0, _) = self.split_u8x32(b);
-        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
+    fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, _) = self.split_i8x32(a);
+        let (b0, _) = self.split_i8x32(b);
+        self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (_, a1) = self.split_u8x32(a);
-        let (_, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
+    fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (_, a1) = self.split_i8x32(a);
+        let (_, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
+    fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
+    fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let lo_lo = self.zip_low_u8x16(a0, b0);
-        let lo_hi = self.zip_high_u8x16(a0, b0);
-        let hi_lo = self.zip_low_u8x16(a1, b1);
-        let hi_hi = self.zip_high_u8x16(a1, b1);
+    fn interleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        let lo_lo = self.zip_low_i8x16(a0, b0);
+        let lo_hi = self.zip_high_i8x16(a0, b0);
+        let hi_lo = self.zip_low_i8x16(a1, b1);
+        let hi_hi = self.zip_high_i8x16(a1, b1);
         (
-            self.combine_u8x16(lo_lo, lo_hi),
-            self.combine_u8x16(hi_lo, hi_hi),
+            self.combine_i8x16(lo_lo, lo_hi),
+            self.combine_i8x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let lo_even = self.unzip_low_u8x16(a0, a1);
-        let lo_odd = self.unzip_high_u8x16(a0, a1);
-        let hi_even = self.unzip_low_u8x16(b0, b1);
-        let hi_odd = self.unzip_high_u8x16(b0, b1);
+    fn deinterleave_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        let lo_even = self.unzip_low_i8x16(a0, a1);
+        let lo_odd = self.unzip_high_i8x16(a0, a1);
+        let hi_even = self.unzip_low_i8x16(b0, b1);
+        let hi_odd = self.unzip_high_i8x16(b0, b1);
         (
-            self.combine_u8x16(lo_even, hi_even),
-            self.combine_u8x16(lo_odd, hi_odd),
+            self.combine_i8x16(lo_even, hi_even),
+            self.combine_i8x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+    fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        let (c0, c1) = self.split_u8x32(c);
-        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
+        let (b0, b1) = self.split_i8x32(b);
+        let (c0, c1) = self.split_i8x32(c);
+        self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
+    fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        let (b0, b1) = self.split_u8x32(b);
-        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
+    fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        let (b0, b1) = self.split_i8x32(b);
+        self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
     }
     #[inline(always)]
-    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
+        i8x64 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+    fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
         (
-            u8x16 {
+            i8x16 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            u8x16 {
+            i8x16 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u8x32(a);
-        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
+    fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u8x32(a);
+    fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i8x32(a);
+        self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i8x32(a);
         self.combine_u32x4(
-            self.reinterpret_u32_u8x16(a0),
-            self.reinterpret_u32_u8x16(a1),
+            self.reinterpret_u32_i8x16(a0),
+            self.reinterpret_u32_i8x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
-        let half = self.splat_mask8x16(val);
-        self.combine_mask8x16(half, half)
+    fn splat_u8x32(self, val: u8) -> u8x32<Self> {
+        let half = self.splat_u8x16(val);
+        self.combine_u8x16(half, half)
     }
     #[inline(always)]
-    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
-        mask8x32 {
+    fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0)
-    }
-    #[inline(always)]
-    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
-        let lo = self.from_bitmask_mask8x16(bits);
-        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
-        self.combine_mask8x16(lo, hi)
+    fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x32(a);
-        let lo = self.to_bitmask_mask8x16(lo);
-        let hi = self.to_bitmask_mask8x16(hi);
-        lo | (hi << 16usize)
+    fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 32usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            32usize
-        );
-        let mut lanes = self.as_array_mask8x32(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x32(lanes);
+    fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [u8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
+    fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [u8; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
+    fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
+    fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
+    fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn select_mask8x32(
+    fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        if SHIFT >= 32usize {
+            return b;
+        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u8x32(a).val.0,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u8x32<const SHIFT: usize>(
         self,
-        a: mask8x32<Self>,
-        b: mask8x32<Self>,
-        c: mask8x32<Self>,
-    ) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        let (c0, c1) = self.split_mask8x32(c);
-        self.combine_mask8x16(
-            self.select_mask8x16(a0, b0, c0),
-            self.select_mask8x16(a1, b1, c1),
+        a: u8x32<Self>,
+        b: u8x32<Self>,
+    ) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(
+            self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
-        let (a0, a1) = self.split_mask8x32(a);
-        let (b0, b1) = self.split_mask8x32(b);
-        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
+    fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
+    fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
+    fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
+    fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x32(a);
-        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
+    fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
-        mask8x64 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
-        (
-            mask8x16 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask8x16 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
     }
     #[inline(always)]
-    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
-        let half = self.splat_i16x8(val);
-        self.combine_i16x8(half, half)
+    fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
     }
     #[inline(always)]
-    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
     }
     #[inline(always)]
-    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0)
+    fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [i16; 16usize]>(&a.val.0)
+    fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [i16; 16usize]>(&mut a.val.0)
+    fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
-        i16x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_i16x16(a).val.0,
-            self.cvt_to_bytes_i16x16(b).val.0,
-            SHIFT * 2usize,
-        );
-        self.cvt_from_bytes_i16x16(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, _) = self.split_u8x32(a);
+        let (b0, _) = self.split_u8x32(b);
+        self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x16<const SHIFT: usize>(
-        self,
-        a: i16x16<Self>,
-        b: i16x16<Self>,
-    ) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(
-            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
-        )
+    fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (_, a1) = self.split_u8x32(a);
+        let (_, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
+    fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
     }
     #[inline(always)]
-    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
+    fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
     }
     #[inline(always)]
-    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
+    fn interleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let lo_lo = self.zip_low_u8x16(a0, b0);
+        let lo_hi = self.zip_high_u8x16(a0, b0);
+        let hi_lo = self.zip_low_u8x16(a1, b1);
+        let hi_hi = self.zip_high_u8x16(a1, b1);
+        (
+            self.combine_u8x16(lo_lo, lo_hi),
+            self.combine_u8x16(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
+    fn deinterleave_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let lo_even = self.unzip_low_u8x16(a0, a1);
+        let lo_odd = self.unzip_high_u8x16(a0, a1);
+        let hi_even = self.unzip_low_u8x16(b0, b1);
+        let hi_odd = self.unzip_high_u8x16(b0, b1);
+        (
+            self.combine_u8x16(lo_even, hi_even),
+            self.combine_u8x16(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
+    fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        let (c0, c1) = self.split_u8x32(c);
+        self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
     }
     #[inline(always)]
-    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
+    fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
+    fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        let (b0, b1) = self.split_u8x32(b);
+        self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
     }
     #[inline(always)]
-    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
+    fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
+    fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
+        (
+            u8x16 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            u8x16 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
+    fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
     }
     #[inline(always)]
-    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
+    fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u8x32(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u8x16(a0),
+            self.reinterpret_u32_u8x16(a1),
+        )
     }
     #[inline(always)]
-    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
+    fn splat_mask8x32(self, val: bool) -> mask8x32<Self> {
+        let half = self.splat_mask8x16(val);
+        self.combine_mask8x16(half, half)
     }
     #[inline(always)]
-    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
+    fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
+        mask8x32 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
+    fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i8; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
     }
     #[inline(always)]
-    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
     }
     #[inline(always)]
-    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, _) = self.split_i16x16(a);
-        let (b0, _) = self.split_i16x16(b);
-        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
+    fn set_mask8x32(self, a: &mut mask8x32<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 32usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32usize
+        );
+        let mut lanes = self.as_array_mask8x32(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x32(lanes);
     }
     #[inline(always)]
-    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (_, a1) = self.split_i16x16(a);
-        let (_, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
+    fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
+    fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
+    fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let lo_lo = self.zip_low_i16x8(a0, b0);
-        let lo_hi = self.zip_high_i16x8(a0, b0);
-        let hi_lo = self.zip_low_i16x8(a1, b1);
-        let hi_hi = self.zip_high_i16x8(a1, b1);
-        (
-            self.combine_i16x8(lo_lo, lo_hi),
-            self.combine_i16x8(hi_lo, hi_hi),
-        )
+    fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
     }
     #[inline(always)]
-    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let lo_even = self.unzip_low_i16x8(a0, a1);
-        let lo_odd = self.unzip_high_i16x8(a0, a1);
-        let hi_even = self.unzip_low_i16x8(b0, b1);
-        let hi_odd = self.unzip_high_i16x8(b0, b1);
-        (
-            self.combine_i16x8(lo_even, hi_even),
-            self.combine_i16x8(lo_odd, hi_odd),
+    fn select_mask8x32(
+        self,
+        a: mask8x32<Self>,
+        b: mask8x32<Self>,
+        c: mask8x32<Self>,
+    ) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        let (c0, c1) = self.split_mask8x32(c);
+        self.combine_mask8x16(
+            self.select_mask8x16(a0, b0, c0),
+            self.select_mask8x16(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        let (c0, c1) = self.split_i16x16(c);
-        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
+    fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
+        let (a0, a1) = self.split_mask8x32(a);
+        let (b0, b1) = self.split_mask8x32(b);
+        self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
     }
     #[inline(always)]
-    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
+    fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
     }
     #[inline(always)]
-    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        let (b0, b1) = self.split_i16x16(b);
-        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
+    fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
     }
     #[inline(always)]
-    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x32(a);
+        self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
+    }
+    #[inline(always)]
+    fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
+        mask8x64 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
+    fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
         (
-            i16x8 {
+            mask8x16 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            i16x8 {
+            mask8x16 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
-    }
-    #[inline(always)]
-    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i16x16(a);
-        self.combine_u32x4(
-            self.reinterpret_u32_i16x8(a0),
-            self.reinterpret_u32_i16x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
-        let half = self.splat_u16x8(val);
-        self.combine_u16x8(half, half)
+    fn splat_i16x16(self, val: i16) -> i16x16<Self> {
+        let half = self.splat_i16x8(val);
+        self.combine_i16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
-        u16x16 {
+    fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
+        i16x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u16; 16usize]>(&a.val.0)
+    fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [u16; 16usize]>(&a.val.0)
+    fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [i16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [u16; 16usize]>(&mut a.val.0)
+    fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [i16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+    fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
-        u16x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
+    fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
+        i16x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+    fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
         if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_u16x16(a).val.0,
-            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
+            self.cvt_to_bytes_i16x16(b).val.0,
             SHIFT * 2usize,
         );
-        self.cvt_from_bytes_u16x16(u8x32 {
+        self.cvt_from_bytes_i16x16(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x16<const SHIFT: usize>(
+    fn slide_within_blocks_i16x16<const SHIFT: usize>(
         self,
-        a: u16x16<Self>,
-        b: u16x16<Self>,
-    ) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(
-            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
+        a: i16x16<Self>,
+        b: i16x16<Self>,
+    ) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(
+            self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
+    fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
+    fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
+    fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
+    fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
+    fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
+    fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
+    fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
     }
     #[inline(always)]
-    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
+    fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
+    fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
+    fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
+    fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
+    fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
+    fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
+    fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
+    fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
+    fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, _) = self.split_u16x16(a);
-        let (b0, _) = self.split_u16x16(b);
-        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
+    fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, _) = self.split_i16x16(a);
+        let (b0, _) = self.split_i16x16(b);
+        self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (_, a1) = self.split_u16x16(a);
-        let (_, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
+    fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (_, a1) = self.split_i16x16(a);
+        let (_, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
+    fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
+    fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let lo_lo = self.zip_low_u16x8(a0, b0);
-        let lo_hi = self.zip_high_u16x8(a0, b0);
-        let hi_lo = self.zip_low_u16x8(a1, b1);
-        let hi_hi = self.zip_high_u16x8(a1, b1);
+    fn interleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let lo_lo = self.zip_low_i16x8(a0, b0);
+        let lo_hi = self.zip_high_i16x8(a0, b0);
+        let hi_lo = self.zip_low_i16x8(a1, b1);
+        let hi_hi = self.zip_high_i16x8(a1, b1);
         (
-            self.combine_u16x8(lo_lo, lo_hi),
-            self.combine_u16x8(hi_lo, hi_hi),
+            self.combine_i16x8(lo_lo, lo_hi),
+            self.combine_i16x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let lo_even = self.unzip_low_u16x8(a0, a1);
-        let lo_odd = self.unzip_high_u16x8(a0, a1);
-        let hi_even = self.unzip_low_u16x8(b0, b1);
-        let hi_odd = self.unzip_high_u16x8(b0, b1);
+    fn deinterleave_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> (i16x16<Self>, i16x16<Self>) {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        let lo_even = self.unzip_low_i16x8(a0, a1);
+        let lo_odd = self.unzip_high_i16x8(a0, a1);
+        let hi_even = self.unzip_low_i16x8(b0, b1);
+        let hi_odd = self.unzip_high_i16x8(b0, b1);
         (
-            self.combine_u16x8(lo_even, hi_even),
-            self.combine_u16x8(lo_odd, hi_odd),
+            self.combine_i16x8(lo_even, hi_even),
+            self.combine_i16x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+    fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        let (c0, c1) = self.split_u16x16(c);
-        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
+        let (b0, b1) = self.split_i16x16(b);
+        let (c0, c1) = self.split_i16x16(c);
+        self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
+    fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        let (b0, b1) = self.split_u16x16(b);
-        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
+    fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        let (b0, b1) = self.split_i16x16(b);
+        self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
     }
     #[inline(always)]
-    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
+    fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
         (
-            u16x8 {
+            i16x8 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            u16x8 {
+            i16x8 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
-        let mask = u16x8_splat(0xFF);
-        let (low, high) = self.split_u16x16(a);
-        let low_masked = v128_and(low.into(), mask);
-        let high_masked = v128_and(high.into(), mask);
-        let result = u8x16_narrow_i16x8(low_masked, high_masked);
-        result.simd_into(self)
+    fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u16x16(a);
-        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
+    fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i16x16(a);
+        self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u16x16(a);
+    fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i16x16(a);
         self.combine_u32x4(
-            self.reinterpret_u32_u16x8(a0),
-            self.reinterpret_u32_u16x8(a1),
+            self.reinterpret_u32_i16x8(a0),
+            self.reinterpret_u32_i16x8(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
-        let half = self.splat_mask16x8(val);
-        self.combine_mask16x8(half, half)
+    fn splat_u16x16(self, val: u16) -> u16x16<Self> {
+        let half = self.splat_u16x8(val);
+        self.combine_u16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
-        mask16x16 {
+    fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0)
+    fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
+        u16x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
-        let lo = self.from_bitmask_mask16x8(bits);
-        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
-        self.combine_mask16x8(lo, hi)
+    fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x16(a);
-        let lo = self.to_bitmask_mask16x8(lo);
-        let hi = self.to_bitmask_mask16x8(hi);
-        lo | (hi << 8usize)
+    fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [u16; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask16x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x16(lanes);
+    fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [u16; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
+    fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
+    fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
+        u16x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
+    fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
+    fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u16x16(a).val.0,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask16x16(
+    fn slide_within_blocks_u16x16<const SHIFT: usize>(
         self,
-        a: mask16x16<Self>,
-        b: mask16x16<Self>,
-        c: mask16x16<Self>,
-    ) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        let (c0, c1) = self.split_mask16x16(c);
-        self.combine_mask16x8(
-            self.select_mask16x8(a0, b0, c0),
-            self.select_mask16x8(a1, b1, c1),
+        a: u16x16<Self>,
+        b: u16x16<Self>,
+    ) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(
+            self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
-        let (a0, a1) = self.split_mask16x16(a);
-        let (b0, b1) = self.split_mask16x16(b);
-        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
+    fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
+    fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
+    fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
+    fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x16(a);
-        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
+    fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
-        mask16x32 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
-        (
-            mask16x8 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask16x8 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
     }
     #[inline(always)]
-    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
-        let half = self.splat_i32x4(val);
-        self.combine_i32x4(half, half)
+    fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
     }
     #[inline(always)]
-    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
-        i32x8 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
     }
     #[inline(always)]
-    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0)
+    fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [i32; 8usize]>(&a.val.0)
+    fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [i32; 8usize]>(&mut a.val.0)
+    fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
-        i32x8 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        if SHIFT >= 8usize {
-            return b;
-        }
-        let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_i32x8(a).val.0,
-            self.cvt_to_bytes_i32x8(b).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_i32x8(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
-    }
-    #[inline(always)]
-    fn slide_within_blocks_i32x8<const SHIFT: usize>(
-        self,
-        a: i32x8<Self>,
-        b: i32x8<Self>,
-    ) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(
-            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
-    }
-    #[inline(always)]
-    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
-    }
-    #[inline(always)]
-    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
-    }
-    #[inline(always)]
-    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, _) = self.split_i32x8(a);
-        let (b0, _) = self.split_i32x8(b);
-        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
+    fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, _) = self.split_u16x16(a);
+        let (b0, _) = self.split_u16x16(b);
+        self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (_, a1) = self.split_i32x8(a);
-        let (_, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
+    fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (_, a1) = self.split_u16x16(a);
+        let (_, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
+    fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
+    fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let lo_lo = self.zip_low_i32x4(a0, b0);
-        let lo_hi = self.zip_high_i32x4(a0, b0);
-        let hi_lo = self.zip_low_i32x4(a1, b1);
-        let hi_hi = self.zip_high_i32x4(a1, b1);
+    fn interleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let lo_lo = self.zip_low_u16x8(a0, b0);
+        let lo_hi = self.zip_high_u16x8(a0, b0);
+        let hi_lo = self.zip_low_u16x8(a1, b1);
+        let hi_hi = self.zip_high_u16x8(a1, b1);
         (
-            self.combine_i32x4(lo_lo, lo_hi),
-            self.combine_i32x4(hi_lo, hi_hi),
+            self.combine_u16x8(lo_lo, lo_hi),
+            self.combine_u16x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let lo_even = self.unzip_low_i32x4(a0, a1);
-        let lo_odd = self.unzip_high_i32x4(a0, a1);
-        let hi_even = self.unzip_low_i32x4(b0, b1);
-        let hi_odd = self.unzip_high_i32x4(b0, b1);
+    fn deinterleave_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> (u16x16<Self>, u16x16<Self>) {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let lo_even = self.unzip_low_u16x8(a0, a1);
+        let lo_odd = self.unzip_high_u16x8(a0, a1);
+        let hi_even = self.unzip_low_u16x8(b0, b1);
+        let hi_odd = self.unzip_high_u16x8(b0, b1);
         (
-            self.combine_i32x4(lo_even, hi_even),
-            self.combine_i32x4(lo_odd, hi_odd),
+            self.combine_u16x8(lo_even, hi_even),
+            self.combine_u16x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        let (c0, c1) = self.split_i32x8(c);
-        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
+    fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        let (c0, c1) = self.split_u16x16(c);
+        self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
+    fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        let (b0, b1) = self.split_i32x8(b);
-        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
+    fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        let (b0, b1) = self.split_u16x16(b);
+        self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
     }
     #[inline(always)]
-    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
-        i32x16 {
+    fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
+        u16x32 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
+    fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
         (
-            i32x4 {
+            u16x8 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            i32x4 {
+            u16x8 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
+    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
+        let mask = u16x8_splat(0xFF);
+        let (low, high) = self.split_u16x16(a);
+        let low_masked = v128_and(low.into(), mask);
+        let high_masked = v128_and(high.into(), mask);
+        let result = u8x16_narrow_i16x8(low_masked, high_masked);
+        result.simd_into(self)
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
+    fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u16x16(a);
+        self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
+    fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u16x16(a);
         self.combine_u32x4(
-            self.reinterpret_u32_i32x4(a0),
-            self.reinterpret_u32_i32x4(a1),
+            self.reinterpret_u32_u16x8(a0),
+            self.reinterpret_u32_u16x8(a1),
         )
     }
     #[inline(always)]
-    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_i32x8(a);
-        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
-    }
-    #[inline(always)]
-    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
-        let half = self.splat_u32x4(val);
-        self.combine_u32x4(half, half)
+    fn splat_mask16x16(self, val: bool) -> mask16x16<Self> {
+        let half = self.splat_mask16x8(val);
+        self.combine_mask16x8(half, half)
     }
     #[inline(always)]
-    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
+        mask16x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
-        u32x8 {
+    fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i16; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x16(a);
+        let lo = self.to_bitmask_mask16x8(lo);
+        let hi = self.to_bitmask_mask16x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
+    fn set_mask16x16(self, a: &mut mask16x16<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 16usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16usize
+        );
+        let mut lanes = self.as_array_mask16x16(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
+    fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
+    }
+    #[inline(always)]
+    fn select_mask16x16(
+        self,
+        a: mask16x16<Self>,
+        b: mask16x16<Self>,
+        c: mask16x16<Self>,
+    ) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        let (c0, c1) = self.split_mask16x16(c);
+        self.combine_mask16x8(
+            self.select_mask16x8(a0, b0, c0),
+            self.select_mask16x8(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
+        let (a0, a1) = self.split_mask16x16(a);
+        let (b0, b1) = self.split_mask16x16(b);
+        self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
+    }
+    #[inline(always)]
+    fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x16(a);
+        self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
+    }
+    #[inline(always)]
+    fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
+        mask16x32 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
+        (
+            mask16x8 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            mask16x8 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_i32x8(self, val: i32) -> i32x8<Self> {
+        let half = self.splat_i32x4(val);
+        self.combine_i32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u32; 8usize]>(&a.val.0)
+    fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [u32; 8usize]>(&a.val.0)
+    fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [u32; 8usize]>(&mut a.val.0)
+    fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [i32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+    fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
-        u32x8 {
+    fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
+        i32x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+    fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
         u8x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+    fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_u32x8(a).val.0,
-            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
+            self.cvt_to_bytes_i32x8(b).val.0,
             SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u32x8(u8x32 {
+        self.cvt_from_bytes_i32x8(u8x32 {
             val: crate::support::Aligned256(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x8<const SHIFT: usize>(
+    fn slide_within_blocks_i32x8<const SHIFT: usize>(
         self,
-        a: u32x8<Self>,
-        b: u32x8<Self>,
-    ) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(
-            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
+        a: i32x8<Self>,
+        b: i32x8<Self>,
+    ) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(
+            self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
+    fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
+    fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
+    fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
+    fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
+    fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
+    fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
+    fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
     }
     #[inline(always)]
-    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
+    fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
+    fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
+    fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
+    fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
+    fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
+    fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
+    fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
+    fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
+    fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, _) = self.split_u32x8(a);
-        let (b0, _) = self.split_u32x8(b);
-        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
+    fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, _) = self.split_i32x8(a);
+        let (b0, _) = self.split_i32x8(b);
+        self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (_, a1) = self.split_u32x8(a);
-        let (_, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
+    fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (_, a1) = self.split_i32x8(a);
+        let (_, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
+    fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
+    fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let lo_lo = self.zip_low_u32x4(a0, b0);
-        let lo_hi = self.zip_high_u32x4(a0, b0);
-        let hi_lo = self.zip_low_u32x4(a1, b1);
-        let hi_hi = self.zip_high_u32x4(a1, b1);
+    fn interleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let lo_lo = self.zip_low_i32x4(a0, b0);
+        let lo_hi = self.zip_high_i32x4(a0, b0);
+        let hi_lo = self.zip_low_i32x4(a1, b1);
+        let hi_hi = self.zip_high_i32x4(a1, b1);
         (
-            self.combine_u32x4(lo_lo, lo_hi),
-            self.combine_u32x4(hi_lo, hi_hi),
+            self.combine_i32x4(lo_lo, lo_hi),
+            self.combine_i32x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let lo_even = self.unzip_low_u32x4(a0, a1);
-        let lo_odd = self.unzip_high_u32x4(a0, a1);
-        let hi_even = self.unzip_low_u32x4(b0, b1);
-        let hi_odd = self.unzip_high_u32x4(b0, b1);
+    fn deinterleave_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> (i32x8<Self>, i32x8<Self>) {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        let lo_even = self.unzip_low_i32x4(a0, a1);
+        let lo_odd = self.unzip_high_i32x4(a0, a1);
+        let hi_even = self.unzip_low_i32x4(b0, b1);
+        let hi_odd = self.unzip_high_i32x4(b0, b1);
         (
-            self.combine_u32x4(lo_even, hi_even),
-            self.combine_u32x4(lo_odd, hi_odd),
+            self.combine_i32x4(lo_even, hi_even),
+            self.combine_i32x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+    fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        let (c0, c1) = self.split_u32x8(c);
-        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
+        let (b0, b1) = self.split_i32x8(b);
+        let (c0, c1) = self.split_i32x8(c);
+        self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
+    fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        let (b0, b1) = self.split_u32x8(b);
-        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
+    fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        let (b0, b1) = self.split_i32x8(b);
+        self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
     }
     #[inline(always)]
-    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
-        u32x16 {
+    fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+    fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
         (
-            u32x4 {
+            i32x4 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            u32x4 {
+            i32x4 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
+    fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
     }
     #[inline(always)]
-    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_u32x8(a);
-        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
+    fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
     }
     #[inline(always)]
-    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
-        let half = self.splat_mask32x4(val);
-        self.combine_mask32x4(half, half)
+    fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i32x4(a0),
+            self.reinterpret_u32_i32x4(a1),
+        )
     }
     #[inline(always)]
-    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
-        mask32x8 {
+    fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_i32x8(a);
+        self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
+    }
+    #[inline(always)]
+    fn splat_u32x8(self, val: u32) -> u32x8<Self> {
+        let half = self.splat_u32x4(val);
+        self.combine_u32x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0)
+    fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
-        let lo = self.from_bitmask_mask32x4(bits);
-        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
-        self.combine_mask32x4(lo, hi)
+    fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x8(a);
-        let lo = self.to_bitmask_mask32x4(lo);
-        let hi = self.to_bitmask_mask32x4(hi);
-        lo | (hi << 4usize)
+    fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [u32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 8usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            8usize
-        );
-        let mut lanes = self.as_array_mask32x8(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x8(lanes);
+    fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [u32; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
+    fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
+    fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
+        u32x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
+    fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
+    fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u32x8(a).val.0,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask32x8(
+    fn slide_within_blocks_u32x8<const SHIFT: usize>(
         self,
-        a: mask32x8<Self>,
-        b: mask32x8<Self>,
-        c: mask32x8<Self>,
-    ) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        let (c0, c1) = self.split_mask32x8(c);
-        self.combine_mask32x4(
-            self.select_mask32x4(a0, b0, c0),
-            self.select_mask32x4(a1, b1, c1),
+        a: u32x8<Self>,
+        b: u32x8<Self>,
+    ) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(
+            self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
-        let (a0, a1) = self.split_mask32x8(a);
-        let (b0, b1) = self.split_mask32x8(b);
-        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
-    }
-    #[inline(always)]
-    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
+    fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
+    fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
+    fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x8(a);
-        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
+    fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
-        (
-            mask32x4 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask32x4 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
-        let half = self.splat_f64x2(val);
-        self.combine_f64x2(half, half)
+    fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
     }
     #[inline(always)]
-    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
     }
     #[inline(always)]
-    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [f64; 4usize]>(&a.val.0)
+    fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
     }
     #[inline(always)]
-    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
-        crate::transmute::checked_cast_ref::<[v128; 2usize], [f64; 4usize]>(&a.val.0)
+    fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
-        crate::transmute::checked_cast_mut::<[v128; 2usize], [f64; 4usize]>(&mut a.val.0)
+    fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
-        f64x4 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
-        u8x32 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        if SHIFT >= 4usize {
-            return b;
-        }
-        let result = cross_block_slide_128x2(
-            self.cvt_to_bytes_f64x4(a).val.0,
-            self.cvt_to_bytes_f64x4(b).val.0,
-            SHIFT * 8usize,
-        );
-        self.cvt_from_bytes_f64x4(u8x32 {
-            val: crate::support::Aligned256(result),
-            simd: self,
-        })
+    fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x4<const SHIFT: usize>(
-        self,
-        a: f64x4<Self>,
-        b: f64x4<Self>,
-    ) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
-        )
+    fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, _) = self.split_u32x8(a);
+        let (b0, _) = self.split_u32x8(b);
+        self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
     }
     #[inline(always)]
-    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
+    fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (_, a1) = self.split_u32x8(a);
+        let (_, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
+    fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
     }
     #[inline(always)]
-    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
+    fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
     }
     #[inline(always)]
-    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(
-            self.approximate_recip_f64x2(a0),
-            self.approximate_recip_f64x2(a1),
+    fn interleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let lo_lo = self.zip_low_u32x4(a0, b0);
+        let lo_hi = self.zip_high_u32x4(a0, b0);
+        let hi_lo = self.zip_low_u32x4(a1, b1);
+        let hi_hi = self.zip_high_u32x4(a1, b1);
+        (
+            self.combine_u32x4(lo_lo, lo_hi),
+            self.combine_u32x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
+    fn deinterleave_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> (u32x8<Self>, u32x8<Self>) {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let lo_even = self.unzip_low_u32x4(a0, a1);
+        let lo_odd = self.unzip_high_u32x4(a0, a1);
+        let hi_even = self.unzip_low_u32x4(b0, b1);
+        let hi_odd = self.unzip_high_u32x4(b0, b1);
+        (
+            self.combine_u32x4(lo_even, hi_even),
+            self.combine_u32x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
+    fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        let (c0, c1) = self.split_u32x8(c);
+        self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
+    fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
+    fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        let (b0, b1) = self.split_u32x8(b);
+        self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
     }
     #[inline(always)]
-    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
+    fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
+        u32x16 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
+    fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
+        (
+            u32x4 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            u32x4 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
+    fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
     }
     #[inline(always)]
-    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, _) = self.split_f64x4(a);
-        let (b0, _) = self.split_f64x4(b);
-        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
+    fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_u32x8(a);
+        self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
     }
     #[inline(always)]
-    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (_, a1) = self.split_f64x4(a);
-        let (_, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
+    fn splat_mask32x8(self, val: bool) -> mask32x8<Self> {
+        let half = self.splat_mask32x4(val);
+        self.combine_mask32x4(half, half)
     }
     #[inline(always)]
-    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
+    fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
+        mask32x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
+    fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i32; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let lo_lo = self.zip_low_f64x2(a0, b0);
-        let lo_hi = self.zip_high_f64x2(a0, b0);
-        let hi_lo = self.zip_low_f64x2(a1, b1);
-        let hi_hi = self.zip_high_f64x2(a1, b1);
-        (
-            self.combine_f64x2(lo_lo, lo_hi),
-            self.combine_f64x2(hi_lo, hi_hi),
-        )
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
     }
     #[inline(always)]
-    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let lo_even = self.unzip_low_f64x2(a0, a1);
-        let lo_odd = self.unzip_high_f64x2(a0, a1);
-        let hi_even = self.unzip_low_f64x2(b0, b1);
-        let hi_odd = self.unzip_high_f64x2(b0, b1);
-        (
-            self.combine_f64x2(lo_even, hi_even),
-            self.combine_f64x2(lo_odd, hi_odd),
-        )
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
     }
     #[inline(always)]
-    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
+    fn set_mask32x8(self, a: &mut mask32x8<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 8usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8usize
+        );
+        let mut lanes = self.as_array_mask32x8(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask32x8(lanes);
     }
     #[inline(always)]
-    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
+    fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.max_precise_f64x2(a0, b0),
-            self.max_precise_f64x2(a1, b1),
-        )
+    fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        self.combine_f64x2(
-            self.min_precise_f64x2(a0, b0),
-            self.min_precise_f64x2(a1, b1),
-        )
+    fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(
-            self.mul_add_f64x2(a0, b0, c0),
-            self.mul_add_f64x2(a1, b1, c1),
-        )
+    fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
     }
     #[inline(always)]
-    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(
-            self.mul_sub_f64x2(a0, b0, c0),
-            self.mul_sub_f64x2(a1, b1, c1),
+    fn select_mask32x8(
+        self,
+        a: mask32x8<Self>,
+        b: mask32x8<Self>,
+        c: mask32x8<Self>,
+    ) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        let (c0, c1) = self.split_mask32x8(c);
+        self.combine_mask32x4(
+            self.select_mask32x4(a0, b0, c0),
+            self.select_mask32x4(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
-    }
-    #[inline(always)]
-    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
+    fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
+        let (a0, a1) = self.split_mask32x8(a);
+        let (b0, b1) = self.split_mask32x8(b);
+        self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
     }
     #[inline(always)]
-    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(
-            self.round_ties_even_f64x2(a0),
-            self.round_ties_even_f64x2(a1),
-        )
+    fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
     }
     #[inline(always)]
-    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
+    fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
     }
     #[inline(always)]
-    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
+    fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
     }
     #[inline(always)]
-    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_f64x4(b);
-        let (c0, c1) = self.split_f64x4(c);
-        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
+    fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x8(a);
+        self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
     }
     #[inline(always)]
-    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
+        mask32x16 {
             val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
             simd: self,
         }
     }
     #[inline(always)]
-    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
+    fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
         (
-            f64x2 {
+            mask32x4 {
                 val: crate::support::Aligned128(a.val.0[0]),
                 simd: self,
             },
-            f64x2 {
+            mask32x4 {
                 val: crate::support::Aligned128(a.val.0[1]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
-        let (a0, a1) = self.split_f64x4(a);
-        self.combine_f32x4(
-            self.reinterpret_f32_f64x2(a0),
-            self.reinterpret_f32_f64x2(a1),
-        )
+    fn splat_f64x4(self, val: f64) -> f64x4<Self> {
+        let half = self.splat_f64x2(val);
+        self.combine_f64x2(half, half)
     }
     #[inline(always)]
-    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
-        let half = self.splat_mask64x2(val);
-        self.combine_mask64x2(half, half)
+    fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
-        mask64x4 {
-            val: crate::transmute::checked_transmute_copy(&val),
+    fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i64; 4usize]>(&a.val.0)
+    fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
-        let lo = self.from_bitmask_mask64x2(bits);
-        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
-        self.combine_mask64x2(lo, hi)
+    fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [f64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
-        let (lo, hi) = self.split_mask64x4(a);
-        let lo = self.to_bitmask_mask64x2(lo);
-        let hi = self.to_bitmask_mask64x2(hi);
-        lo | (hi << 2usize)
-    }
-    #[inline(always)]
-    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 4usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            4usize
-        );
-        let mut lanes = self.as_array_mask64x4(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask64x4(lanes);
+    fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [f64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
+    fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
+    fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
+        f64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
+    fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
+    fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_f64x4(a).val.0,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn select_mask64x4(
+    fn slide_within_blocks_f64x4<const SHIFT: usize>(
         self,
-        a: mask64x4<Self>,
-        b: mask64x4<Self>,
-        c: mask64x4<Self>,
-    ) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        let (c0, c1) = self.split_mask64x4(c);
-        self.combine_mask64x2(
-            self.select_mask64x2(a0, b0, c0),
-            self.select_mask64x2(a1, b1, c1),
+        a: f64x4<Self>,
+        b: f64x4<Self>,
+    ) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
-        let (a0, a1) = self.split_mask64x4(a);
-        let (b0, b1) = self.split_mask64x4(b);
-        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
-    }
-    #[inline(always)]
-    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
-    }
-    #[inline(always)]
-    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
+    fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
     }
     #[inline(always)]
-    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
+    fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
     }
     #[inline(always)]
-    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
-        let (a0, a1) = self.split_mask64x4(a);
-        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
+    fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
     }
     #[inline(always)]
-    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
-        mask64x8 {
-            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
-            simd: self,
-        }
+    fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(
+            self.approximate_recip_f64x2(a0),
+            self.approximate_recip_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
-        (
-            mask64x2 {
-                val: crate::support::Aligned128(a.val.0[0]),
-                simd: self,
-            },
-            mask64x2 {
-                val: crate::support::Aligned128(a.val.0[1]),
-                simd: self,
-            },
-        )
+    fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
-        let half = self.splat_f32x8(val);
-        self.combine_f32x8(half, half)
+    fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [f32; 16usize]>(&a.val.0)
+    fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [f32; 16usize]>(&a.val.0)
+    fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [f32; 16usize]>(&mut a.val.0)
+    fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
-        f32x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        u8x64 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_f32x16(a).val.0,
-            self.cvt_to_bytes_f32x16(b).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_f32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
+    fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, _) = self.split_f64x4(a);
+        let (b0, _) = self.split_f64x4(b);
+        self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
     }
     #[inline(always)]
-    fn slide_within_blocks_f32x16<const SHIFT: usize>(
-        self,
-        a: f32x16<Self>,
-        b: f32x16<Self>,
-    ) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
-        )
+    fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (_, a1) = self.split_f64x4(a);
+        let (_, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
+    fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
     }
     #[inline(always)]
-    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
+    fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
     }
     #[inline(always)]
-    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
+    fn interleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let lo_lo = self.zip_low_f64x2(a0, b0);
+        let lo_hi = self.zip_high_f64x2(a0, b0);
+        let hi_lo = self.zip_low_f64x2(a1, b1);
+        let hi_hi = self.zip_high_f64x2(a1, b1);
+        (
+            self.combine_f64x2(lo_lo, lo_hi),
+            self.combine_f64x2(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.approximate_recip_f32x8(a0),
-            self.approximate_recip_f32x8(a1),
+    fn deinterleave_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let lo_even = self.unzip_low_f64x2(a0, a1);
+        let lo_odd = self.unzip_high_f64x2(a0, a1);
+        let hi_even = self.unzip_low_f64x2(b0, b1);
+        let hi_odd = self.unzip_high_f64x2(b0, b1);
+        (
+            self.combine_f64x2(lo_even, hi_even),
+            self.combine_f64x2(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
+    fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
+    fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
     }
     #[inline(always)]
-    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
+    fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.max_precise_f64x2(a0, b0),
+            self.max_precise_f64x2(a1, b1),
+        )
     }
     #[inline(always)]
-    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        self.combine_f64x2(
+            self.min_precise_f64x2(a0, b0),
+            self.min_precise_f64x2(a1, b1),
+        )
     }
     #[inline(always)]
-    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
+    fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(
+            self.mul_add_f64x2(a0, b0, c0),
+            self.mul_add_f64x2(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
+    fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(
+            self.mul_sub_f64x2(a0, b0, c0),
+            self.mul_sub_f64x2(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
+    fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
+    fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
     }
     #[inline(always)]
-    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
+    fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(
+            self.round_ties_even_f64x2(a0),
+            self.round_ties_even_f64x2(a1),
+        )
     }
     #[inline(always)]
-    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, _) = self.split_f32x16(a);
-        let (b0, _) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
+    fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
     }
     #[inline(always)]
-    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (_, a1) = self.split_f32x16(a);
-        let (_, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
+    fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
     }
     #[inline(always)]
-    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
+    fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_f64x4(b);
+        let (c0, c1) = self.split_f64x4(c);
+        self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
     }
     #[inline(always)]
-    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
+    fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
+        f64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_lo = self.zip_low_f32x8(a0, b0);
-        let lo_hi = self.zip_high_f32x8(a0, b0);
-        let hi_lo = self.zip_low_f32x8(a1, b1);
-        let hi_hi = self.zip_high_f32x8(a1, b1);
+    fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
         (
-            self.combine_f32x8(lo_lo, lo_hi),
-            self.combine_f32x8(hi_lo, hi_hi),
+            f64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            f64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let lo_even = self.unzip_low_f32x8(a0, a1);
-        let lo_odd = self.unzip_high_f32x8(a0, a1);
-        let hi_even = self.unzip_low_f32x8(b0, b1);
-        let hi_odd = self.unzip_high_f32x8(b0, b1);
-        (
-            self.combine_f32x8(lo_even, hi_even),
-            self.combine_f32x8(lo_odd, hi_odd),
+    fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f64x4(a);
+        self.combine_f32x4(
+            self.reinterpret_f32_f64x2(a0),
+            self.reinterpret_f32_f64x2(a1),
         )
     }
     #[inline(always)]
-    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    fn splat_i64x4(self, val: i64) -> i64x4<Self> {
+        let half = self.splat_i64x2(val);
+        self.combine_i64x2(half, half)
     }
     #[inline(always)]
-    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    fn load_array_i64x4(self, val: [i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.max_precise_f32x8(a0, b0),
-            self.max_precise_f32x8(a1, b1),
-        )
+    fn load_array_ref_i64x4(self, val: &[i64; 4usize]) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        self.combine_f32x8(
-            self.min_precise_f32x8(a0, b0),
-            self.min_precise_f32x8(a1, b1),
-        )
+    fn as_array_i64x4(self, a: i64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_add_f32x8(a0, b0, c0),
-            self.mul_add_f32x8(a1, b1, c1),
-        )
+    fn as_array_ref_i64x4(self, a: &i64x4<Self>) -> &[i64; 4usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [i64; 4usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        let (b0, b1) = self.split_f32x16(b);
-        let (c0, c1) = self.split_f32x16(c);
-        self.combine_f32x8(
-            self.mul_sub_f32x8(a0, b0, c0),
-            self.mul_sub_f32x8(a1, b1, c1),
-        )
+    fn as_array_mut_i64x4(self, a: &mut i64x4<Self>) -> &mut [i64; 4usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [i64; 4usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+    fn store_array_i64x4(self, a: i64x4<Self>, dest: &mut [i64; 4usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
+    fn cvt_from_bytes_i64x4(self, a: u8x32<Self>) -> i64x4<Self> {
+        i64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(
-            self.round_ties_even_f32x8(a0),
-            self.round_ties_even_f32x8(a1),
+    fn cvt_to_bytes_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i64x4<const SHIFT: usize>(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_i64x4(a).val.0,
+            self.cvt_to_bytes_i64x4(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i64x4<const SHIFT: usize>(
+        self,
+        a: i64x4<Self>,
+        b: i64x4<Self>,
+    ) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(
+            self.slide_within_blocks_i64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x2::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
+    fn add_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.add_i64x2(a0, b0), self.add_i64x2(a1, b1))
     }
     #[inline(always)]
-    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
+    fn sub_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.sub_i64x2(a0, b0), self.sub_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.mul_i64x2(a0, b0), self.mul_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn and_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.and_i64x2(a0, b0), self.and_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.or_i64x2(a0, b0), self.or_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.xor_i64x2(a0, b0), self.xor_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.not_i64x2(a0), self.not_i64x2(a1))
+    }
+    #[inline(always)]
+    fn shl_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shl_i64x2(a0, shift), self.shl_i64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shlv_i64x2(a0, b0), self.shlv_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i64x4(self, a: i64x4<Self>, shift: u32) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.shr_i64x2(a0, shift), self.shr_i64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.shrv_i64x2(a0, b0), self.shrv_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_eq_i64x2(a0, b0), self.simd_eq_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_lt_i64x2(a0, b0), self.simd_lt_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_le_i64x2(a0, b0), self.simd_le_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_ge_i64x2(a0, b0), self.simd_ge_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_mask64x2(self.simd_gt_i64x2(a0, b0), self.simd_gt_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, _) = self.split_i64x4(a);
+        let (b0, _) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a0, b0), self.zip_high_i64x2(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (_, a1) = self.split_i64x4(a);
+        let (_, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.zip_low_i64x2(a1, b1), self.zip_high_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_low_i64x2(a0, a1), self.unzip_low_i64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.unzip_high_i64x2(a0, a1), self.unzip_high_i64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_lo = self.zip_low_i64x2(a0, b0);
+        let lo_hi = self.zip_high_i64x2(a0, b0);
+        let hi_lo = self.zip_low_i64x2(a1, b1);
+        let hi_hi = self.zip_high_i64x2(a1, b1);
+        (
+            self.combine_i64x2(lo_lo, lo_hi),
+            self.combine_i64x2(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> (i64x4<Self>, i64x4<Self>) {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let lo_even = self.unzip_low_i64x2(a0, a1);
+        let lo_odd = self.unzip_high_i64x2(a0, a1);
+        let hi_even = self.unzip_low_i64x2(b0, b1);
+        let hi_odd = self.unzip_high_i64x2(b0, b1);
+        (
+            self.combine_i64x2(lo_even, hi_even),
+            self.combine_i64x2(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_i64x4(self, a: mask64x4<Self>, b: i64x4<Self>, c: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        let (c0, c1) = self.split_i64x4(c);
+        self.combine_i64x2(self.select_i64x2(a0, b0, c0), self.select_i64x2(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.min_i64x2(a0, b0), self.min_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        let (b0, b1) = self.split_i64x4(b);
+        self.combine_i64x2(self.max_i64x2(a0, b0), self.max_i64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_i64x4(self, a: i64x4<Self>, b: i64x4<Self>) -> i64x8<Self> {
+        i64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_i64x4(self, a: i64x4<Self>) -> (i64x2<Self>, i64x2<Self>) {
+        (
+            i64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            i64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn neg_i64x4(self, a: i64x4<Self>) -> i64x4<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_i64x2(self.neg_i64x2(a0), self.neg_i64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i64x4(self, a: i64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_i64x2(a0), self.reinterpret_u8_i64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x4(self, a: i64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_i64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_i64x2(a0),
+            self.reinterpret_u32_i64x2(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u64x4(self, val: u64) -> u64x4<Self> {
+        let half = self.splat_u64x2(val);
+        self.combine_u64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u64x4(self, val: [u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u64x4(self, val: &[u64; 4usize]) -> u64x4<Self> {
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u64x4(self, a: u64x4<Self>) -> [u64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [u64; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u64x4(self, a: &u64x4<Self>) -> &[u64; 4usize] {
+        crate::transmute::checked_cast_ref::<[v128; 2usize], [u64; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u64x4(self, a: &mut u64x4<Self>) -> &mut [u64; 4usize] {
+        crate::transmute::checked_cast_mut::<[v128; 2usize], [u64; 4usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u64x4(self, a: u64x4<Self>, dest: &mut [u64; 4usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u64x4(self, a: u8x32<Self>) -> u64x4<Self> {
+        u64x4 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        u8x32 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u64x4<const SHIFT: usize>(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        if SHIFT >= 4usize {
+            return b;
+        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u64x4(a).val.0,
+            self.cvt_to_bytes_u64x4(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_u64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u64x4<const SHIFT: usize>(
+        self,
+        a: u64x4<Self>,
+        b: u64x4<Self>,
+    ) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(
+            self.slide_within_blocks_u64x2::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x2::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.add_u64x2(a0, b0), self.add_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.sub_u64x2(a0, b0), self.sub_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.mul_u64x2(a0, b0), self.mul_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn and_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.and_u64x2(a0, b0), self.and_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.or_u64x2(a0, b0), self.or_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.xor_u64x2(a0, b0), self.xor_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_u64x4(self, a: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.not_u64x2(a0), self.not_u64x2(a1))
+    }
+    #[inline(always)]
+    fn shl_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shl_u64x2(a0, shift), self.shl_u64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shlv_u64x2(a0, b0), self.shlv_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_u64x4(self, a: u64x4<Self>, shift: u32) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u64x2(self.shr_u64x2(a0, shift), self.shr_u64x2(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.shrv_u64x2(a0, b0), self.shrv_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_eq_u64x2(a0, b0), self.simd_eq_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_lt_u64x2(a0, b0), self.simd_lt_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_le_u64x2(a0, b0), self.simd_le_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_ge_u64x2(a0, b0), self.simd_ge_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_mask64x2(self.simd_gt_u64x2(a0, b0), self.simd_gt_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, _) = self.split_u64x4(a);
+        let (b0, _) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a0, b0), self.zip_high_u64x2(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (_, a1) = self.split_u64x4(a);
+        let (_, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.zip_low_u64x2(a1, b1), self.zip_high_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_low_u64x2(a0, a1), self.unzip_low_u64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.unzip_high_u64x2(a0, a1), self.unzip_high_u64x2(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_lo = self.zip_low_u64x2(a0, b0);
+        let lo_hi = self.zip_high_u64x2(a0, b0);
+        let hi_lo = self.zip_low_u64x2(a1, b1);
+        let hi_hi = self.zip_high_u64x2(a1, b1);
+        (
+            self.combine_u64x2(lo_lo, lo_hi),
+            self.combine_u64x2(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> (u64x4<Self>, u64x4<Self>) {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let lo_even = self.unzip_low_u64x2(a0, a1);
+        let lo_odd = self.unzip_high_u64x2(a0, a1);
+        let hi_even = self.unzip_low_u64x2(b0, b1);
+        let hi_odd = self.unzip_high_u64x2(b0, b1);
+        (
+            self.combine_u64x2(lo_even, hi_even),
+            self.combine_u64x2(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_u64x4(self, a: mask64x4<Self>, b: u64x4<Self>, c: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        let (c0, c1) = self.split_u64x4(c);
+        self.combine_u64x2(self.select_u64x2(a0, b0, c0), self.select_u64x2(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.min_u64x2(a0, b0), self.min_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn max_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x4<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        let (b0, b1) = self.split_u64x4(b);
+        self.combine_u64x2(self.max_u64x2(a0, b0), self.max_u64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn combine_u64x4(self, a: u64x4<Self>, b: u64x4<Self>) -> u64x8<Self> {
+        u64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_u64x4(self, a: u64x4<Self>) -> (u64x2<Self>, u64x2<Self>) {
+        (
+            u64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            u64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x4(self, a: u64x4<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u8x16(self.reinterpret_u8_u64x2(a0), self.reinterpret_u8_u64x2(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x4(self, a: u64x4<Self>) -> u32x8<Self> {
+        let (a0, a1) = self.split_u64x4(a);
+        self.combine_u32x4(
+            self.reinterpret_u32_u64x2(a0),
+            self.reinterpret_u32_u64x2(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_mask64x4(self, val: bool) -> mask64x4<Self> {
+        let half = self.splat_mask64x2(val);
+        self.combine_mask64x2(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
+        mask64x4 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 2usize], [i64; 4usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
+    fn set_mask64x4(self, a: &mut mask64x4<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 4usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4usize
+        );
+        let mut lanes = self.as_array_mask64x4(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
+    fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
+    }
+    #[inline(always)]
+    fn select_mask64x4(
+        self,
+        a: mask64x4<Self>,
+        b: mask64x4<Self>,
+        c: mask64x4<Self>,
+    ) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        let (c0, c1) = self.split_mask64x4(c);
+        self.combine_mask64x2(
+            self.select_mask64x2(a0, b0, c0),
+            self.select_mask64x2(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
+        let (a0, a1) = self.split_mask64x4(a);
+        let (b0, b1) = self.split_mask64x4(b);
+        self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
+    }
+    #[inline(always)]
+    fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
+        let (a0, a1) = self.split_mask64x4(a);
+        self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
+    }
+    #[inline(always)]
+    fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
+        mask64x8 {
+            val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
+        (
+            mask64x2 {
+                val: crate::support::Aligned128(a.val.0[0]),
+                simd: self,
+            },
+            mask64x2 {
+                val: crate::support::Aligned128(a.val.0[1]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn splat_f32x16(self, val: f32) -> f32x16<Self> {
+        let half = self.splat_f32x8(val);
+        self.combine_f32x8(half, half)
+    }
+    #[inline(always)]
+    fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [f32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [f32; 16usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [f32; 16usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
+        f32x16 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        if SHIFT >= 16usize {
+            return b;
+        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_f32x16(a).val.0,
+            self.cvt_to_bytes_f32x16(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_f32x16<const SHIFT: usize>(
+        self,
+        a: f32x16<Self>,
+        b: f32x16<Self>,
+    ) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
+    }
+    #[inline(always)]
+    fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
+    }
+    #[inline(always)]
+    fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
+    }
+    #[inline(always)]
+    fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.approximate_recip_f32x8(a0),
+            self.approximate_recip_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, _) = self.split_f32x16(a);
+        let (b0, _) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (_, a1) = self.split_f32x16(a);
+        let (_, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_lo = self.zip_low_f32x8(a0, b0);
+        let lo_hi = self.zip_high_f32x8(a0, b0);
+        let hi_lo = self.zip_low_f32x8(a1, b1);
+        let hi_hi = self.zip_high_f32x8(a1, b1);
+        (
+            self.combine_f32x8(lo_lo, lo_hi),
+            self.combine_f32x8(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> (f32x16<Self>, f32x16<Self>) {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let lo_even = self.unzip_low_f32x8(a0, a1);
+        let lo_odd = self.unzip_high_f32x8(a0, a1);
+        let hi_even = self.unzip_low_f32x8(b0, b1);
+        let hi_odd = self.unzip_high_f32x8(b0, b1);
+        (
+            self.combine_f32x8(lo_even, hi_even),
+            self.combine_f32x8(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
+    }
+    #[inline(always)]
+    fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.max_precise_f32x8(a0, b0),
+            self.max_precise_f32x8(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        self.combine_f32x8(
+            self.min_precise_f32x8(a0, b0),
+            self.min_precise_f32x8(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_add_f32x8(a0, b0, c0),
+            self.mul_add_f32x8(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        let (b0, b1) = self.split_f32x16(b);
+        let (c0, c1) = self.split_f32x16(c);
+        self.combine_f32x8(
+            self.mul_sub_f32x8(a0, b0, c0),
+            self.mul_sub_f32x8(a1, b1, c1),
+        )
+    }
+    #[inline(always)]
+    fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
+    }
+    #[inline(always)]
+    fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
+    }
+    #[inline(always)]
+    fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(
+            self.round_ties_even_f32x8(a0),
+            self.round_ties_even_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
+    }
+    #[inline(always)]
+    fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
     }
     #[inline(always)]
     fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
@@ -5372,2574 +6595,3151 @@ impl Simd for WasmSimd128 {
         self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+    fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
+        (
+            f32x8 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            f32x8 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f64x4(
+            self.reinterpret_f64_f32x8(a0),
+            self.reinterpret_f64_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.reinterpret_i32_f32x8(a0),
+            self.reinterpret_i32_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        let (chunks, []) = src.as_chunks::<4usize>() else {
+            unreachable!()
+        };
+        let v0: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[0]);
+        let v1: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[1]);
+        let v2: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[2]);
+        let v3: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[3]);
+        let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1);
+        let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3);
+        let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1);
+        let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3);
+        let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower);
+        let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower);
+        let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper);
+        let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper);
+        let combined_lower = self.combine_f32x4(out0.simd_into(self), out1.simd_into(self));
+        let combined_upper = self.combine_f32x4(out2.simd_into(self), out3.simd_into(self));
+        self.combine_f32x8(combined_lower, combined_upper)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        let (lower, upper) = self.split_f32x16(a);
+        let (v0_vec, v1_vec) = self.split_f32x8(lower);
+        let (v2_vec, v3_vec) = self.split_f32x8(upper);
+        let v0: v128 = v0_vec.into();
+        let v1: v128 = v1_vec.into();
+        let v2: v128 = v2_vec.into();
+        let v3: v128 = v3_vec.into();
+        let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2);
+        let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3);
+        let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2);
+        let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3);
+        let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower);
+        let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
+        let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
+        let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
+        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out3, &mut chunks[3]);
+    }
+    #[inline(always)]
+    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_f32x8(a0),
+            self.reinterpret_u32_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
+    }
+    #[inline(always)]
+    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_u32x8(
+            self.cvt_u32_precise_f32x8(a0),
+            self.cvt_u32_precise_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
+    }
+    #[inline(always)]
+    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.cvt_i32_precise_f32x8(a0),
+            self.cvt_i32_precise_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
+        let half = self.splat_i8x32(val);
+        self.combine_i8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [i8; 64usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [i8; 64usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
+        i8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_i8x64(a).val.0,
+            self.cvt_to_bytes_i8x64(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+        self,
+        a: i8x64<Self>,
+        b: i8x64<Self>,
+    ) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(
+            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
+    }
+    #[inline(always)]
+    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, _) = self.split_i8x64(a);
+        let (b0, _) = self.split_i8x64(b);
+        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (_, a1) = self.split_i8x64(a);
+        let (_, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_lo = self.zip_low_i8x32(a0, b0);
+        let lo_hi = self.zip_high_i8x32(a0, b0);
+        let hi_lo = self.zip_low_i8x32(a1, b1);
+        let hi_hi = self.zip_high_i8x32(a1, b1);
+        (
+            self.combine_i8x32(lo_lo, lo_hi),
+            self.combine_i8x32(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let lo_even = self.unzip_low_i8x32(a0, a1);
+        let lo_odd = self.unzip_high_i8x32(a0, a1);
+        let hi_even = self.unzip_low_i8x32(b0, b1);
+        let hi_odd = self.unzip_high_i8x32(b0, b1);
+        (
+            self.combine_i8x32(lo_even, hi_even),
+            self.combine_i8x32(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        let (c0, c1) = self.split_i8x64(c);
+        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        let (b0, b1) = self.split_i8x64(b);
+        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+        (
+            i8x32 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            i8x32 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
+    }
+    #[inline(always)]
+    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i8x64(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i8x32(a0),
+            self.reinterpret_u32_i8x32(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
+        let half = self.splat_u8x32(val);
+        self.combine_u8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u8; 64usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [u8; 64usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [u8; 64usize]>(&mut a.val.0)
+    }
+    #[inline(always)]
+    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
+    }
+    #[inline(always)]
+    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        if SHIFT >= 64usize {
+            return b;
+        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_u8x64(a).val.0,
+            self.cvt_to_bytes_u8x64(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
+    }
+    #[inline(always)]
+    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+        self,
+        a: u8x64<Self>,
+        b: u8x64<Self>,
+    ) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(
+            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
+        )
+    }
+    #[inline(always)]
+    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
+    }
+    #[inline(always)]
+    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
+    }
+    #[inline(always)]
+    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
+    }
+    #[inline(always)]
+    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, _) = self.split_u8x64(a);
+        let (b0, _) = self.split_u8x64(b);
+        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
+    }
+    #[inline(always)]
+    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (_, a1) = self.split_u8x64(a);
+        let (_, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
+    }
+    #[inline(always)]
+    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
+    }
+    #[inline(always)]
+    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_lo = self.zip_low_u8x32(a0, b0);
+        let lo_hi = self.zip_high_u8x32(a0, b0);
+        let hi_lo = self.zip_low_u8x32(a1, b1);
+        let hi_hi = self.zip_high_u8x32(a1, b1);
         (
-            f32x8 {
+            self.combine_u8x32(lo_lo, lo_hi),
+            self.combine_u8x32(hi_lo, hi_hi),
+        )
+    }
+    #[inline(always)]
+    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let lo_even = self.unzip_low_u8x32(a0, a1);
+        let lo_odd = self.unzip_high_u8x32(a0, a1);
+        let hi_even = self.unzip_low_u8x32(b0, b1);
+        let hi_odd = self.unzip_high_u8x32(b0, b1);
+        (
+            self.combine_u8x32(lo_even, hi_even),
+            self.combine_u8x32(lo_odd, hi_odd),
+        )
+    }
+    #[inline(always)]
+    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        let (c0, c1) = self.split_u8x64(c);
+        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
+    }
+    #[inline(always)]
+    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        let (b0, b1) = self.split_u8x64(b);
+        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+        (
+            u8x32 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            f32x8 {
+            u8x32 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f64x4(
-            self.reinterpret_f64_f32x8(a0),
-            self.reinterpret_f64_f32x8(a1),
+    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
+        let (chunks, []) = src.as_chunks::<16usize>() else {
+            unreachable!()
+        };
+        let v0: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[0]);
+        let v1: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[1]);
+        let v2: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[2]);
+        let v3: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[3]);
+        let v01_lower =
+            u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v0, v1);
+        let v23_lower =
+            u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v2, v3);
+        let v01_upper =
+            u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v0, v1);
+        let v23_upper =
+            u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v2, v3);
+        let out0 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(
+            v01_lower, v23_lower,
+        );
+        let out1 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
+            v01_lower, v23_lower,
+        );
+        let out2 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(
+            v01_upper, v23_upper,
+        );
+        let out3 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
+            v01_upper, v23_upper,
+        );
+        let combined_lower = self.combine_u8x16(out0.simd_into(self), out1.simd_into(self));
+        let combined_upper = self.combine_u8x16(out2.simd_into(self), out3.simd_into(self));
+        self.combine_u8x32(combined_lower, combined_upper)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+        let (lower, upper) = self.split_u8x64(a);
+        let (v0_vec, v1_vec) = self.split_u8x32(lower);
+        let (v2_vec, v3_vec) = self.split_u8x32(upper);
+        let v0: v128 = v0_vec.into();
+        let v1: v128 = v1_vec.into();
+        let v2: v128 = v2_vec.into();
+        let v3: v128 = v3_vec.into();
+        let v02_lower =
+            u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v0, v2);
+        let v13_lower =
+            u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v1, v3);
+        let v02_upper =
+            u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v0, v2);
+        let v13_upper =
+            u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v1, v3);
+        let out0 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
+            v02_lower, v13_lower,
+        );
+        let out1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
+            v02_lower, v13_lower,
+        );
+        let out2 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
+            v02_upper, v13_upper,
+        );
+        let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
+            v02_upper, v13_upper,
+        );
+        let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out3, &mut chunks[3]);
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u8x64(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u8x32(a0),
+            self.reinterpret_u32_u8x32(a1),
         )
     }
     #[inline(always)]
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.reinterpret_i32_f32x8(a0),
-            self.reinterpret_i32_f32x8(a1),
-        )
+    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
+        let half = self.splat_mask8x32(val);
+        self.combine_mask8x32(half, half)
+    }
+    #[inline(always)]
+    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
+        mask8x64 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
+    }
+    #[inline(always)]
+    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0)
+    }
+    #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
+    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+        assert!(
+            index < 64usize,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64usize
+        );
+        let mut lanes = self.as_array_mask8x64(*a);
+        lanes[index] = if value { !0 } else { 0 };
+        *a = self.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
+    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
+    }
+    #[inline(always)]
+    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        let (chunks, []) = src.as_chunks::<4usize>() else {
-            unreachable!()
-        };
-        let v0: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[0]);
-        let v1: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[1]);
-        let v2: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[2]);
-        let v3: v128 = crate::transmute::checked_transmute_copy::<[f32; 4usize], v128>(&chunks[3]);
-        let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1);
-        let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3);
-        let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1);
-        let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3);
-        let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower);
-        let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower);
-        let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper);
-        let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper);
-        let combined_lower = self.combine_f32x4(out0.simd_into(self), out1.simd_into(self));
-        let combined_upper = self.combine_f32x4(out2.simd_into(self), out3.simd_into(self));
-        self.combine_f32x8(combined_lower, combined_upper)
+    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        let (lower, upper) = self.split_f32x16(a);
-        let (v0_vec, v1_vec) = self.split_f32x8(lower);
-        let (v2_vec, v3_vec) = self.split_f32x8(upper);
-        let v0: v128 = v0_vec.into();
-        let v1: v128 = v1_vec.into();
-        let v2: v128 = v2_vec.into();
-        let v3: v128 = v3_vec.into();
-        let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2);
-        let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3);
-        let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2);
-        let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3);
-        let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower);
-        let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
-        let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
-        let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
-        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
-            unreachable!()
-        };
-        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out0, &mut chunks[0]);
-        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out1, &mut chunks[1]);
-        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out2, &mut chunks[2]);
-        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out3, &mut chunks[3]);
+    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
+    fn select_mask8x64(
+        self,
+        a: mask8x64<Self>,
+        b: mask8x64<Self>,
+        c: mask8x64<Self>,
+    ) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        let (c0, c1) = self.split_mask8x64(c);
+        self.combine_mask8x32(
+            self.select_mask8x32(a0, b0, c0),
+            self.select_mask8x32(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_f32x8(a0),
-            self.reinterpret_u32_f32x8(a1),
-        )
+    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
+        let (a0, a1) = self.split_mask8x64(a);
+        let (b0, b1) = self.split_mask8x64(b);
+        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
     }
     #[inline(always)]
-    fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
+    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_u32x8(
-            self.cvt_u32_precise_f32x8(a0),
-            self.cvt_u32_precise_f32x8(a1),
-        )
+    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
+    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
     }
     #[inline(always)]
-    fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.cvt_i32_precise_f32x8(a0),
-            self.cvt_i32_precise_f32x8(a1),
+    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
+        let (a0, a1) = self.split_mask8x64(a);
+        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
+    }
+    #[inline(always)]
+    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+        (
+            mask8x32 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            mask8x32 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
         )
     }
     #[inline(always)]
-    fn splat_i8x64(self, val: i8) -> i8x64<Self> {
-        let half = self.splat_i8x32(val);
-        self.combine_i8x32(half, half)
+    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
+        let half = self.splat_i16x16(val);
+        self.combine_i16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
-        i8x64 {
+    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [i8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [i16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
+    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
-        i8x64 {
+    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
+        i16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_i8x64(a).val.0,
-            self.cvt_to_bytes_i8x64(b).val.0,
-            SHIFT,
+            self.cvt_to_bytes_i16x32(a).val.0,
+            self.cvt_to_bytes_i16x32(b).val.0,
+            SHIFT * 2usize,
         );
-        self.cvt_from_bytes_i8x64(u8x64 {
+        self.cvt_from_bytes_i16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i8x64<const SHIFT: usize>(
+    fn slide_within_blocks_i16x32<const SHIFT: usize>(
         self,
-        a: i8x64<Self>,
-        b: i8x64<Self>,
-    ) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(
-            self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
+        a: i16x32<Self>,
+        b: i16x32<Self>,
+    ) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
+    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
+    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
+    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
+    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
+    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
+    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
+    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
     }
     #[inline(always)]
-    fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
+    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
+    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
+    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
+    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
+    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
+    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
+    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
+    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
+    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, _) = self.split_i8x64(a);
-        let (b0, _) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
+    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, _) = self.split_i16x32(a);
+        let (b0, _) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (_, a1) = self.split_i8x64(a);
-        let (_, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
+    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (_, a1) = self.split_i16x32(a);
+        let (_, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
+    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
+    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(
+            self.unzip_high_i16x16(a0, a1),
+            self.unzip_high_i16x16(b0, b1),
+        )
     }
     #[inline(always)]
-    fn interleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_lo = self.zip_low_i8x32(a0, b0);
-        let lo_hi = self.zip_high_i8x32(a0, b0);
-        let hi_lo = self.zip_low_i8x32(a1, b1);
-        let hi_hi = self.zip_high_i8x32(a1, b1);
+    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_lo = self.zip_low_i16x16(a0, b0);
+        let lo_hi = self.zip_high_i16x16(a0, b0);
+        let hi_lo = self.zip_low_i16x16(a1, b1);
+        let hi_hi = self.zip_high_i16x16(a1, b1);
         (
-            self.combine_i8x32(lo_lo, lo_hi),
-            self.combine_i8x32(hi_lo, hi_hi),
+            self.combine_i16x16(lo_lo, lo_hi),
+            self.combine_i16x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> (i8x64<Self>, i8x64<Self>) {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let lo_even = self.unzip_low_i8x32(a0, a1);
-        let lo_odd = self.unzip_high_i8x32(a0, a1);
-        let hi_even = self.unzip_low_i8x32(b0, b1);
-        let hi_odd = self.unzip_high_i8x32(b0, b1);
+    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let lo_even = self.unzip_low_i16x16(a0, a1);
+        let lo_odd = self.unzip_high_i16x16(a0, a1);
+        let hi_even = self.unzip_low_i16x16(b0, b1);
+        let hi_odd = self.unzip_high_i16x16(b0, b1);
         (
-            self.combine_i8x32(lo_even, hi_even),
-            self.combine_i8x32(lo_odd, hi_odd),
+            self.combine_i16x16(lo_even, hi_even),
+            self.combine_i16x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        let (c0, c1) = self.split_i8x64(c);
-        self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
+    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        let (c0, c1) = self.split_i16x32(c);
+        self.combine_i16x16(
+            self.select_i16x16(a0, b0, c0),
+            self.select_i16x16(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
+    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        let (b0, b1) = self.split_i8x64(b);
-        self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
+    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        let (b0, b1) = self.split_i16x32(b);
+        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
     }
     #[inline(always)]
-    fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
+    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
         (
-            i8x32 {
+            i16x16 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            i8x32 {
+            i16x16 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
+    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i8x64(a);
-        self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
+    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_i16x16(a0),
+            self.reinterpret_u8_i16x16(a1),
+        )
     }
     #[inline(always)]
-    fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i8x64(a);
+    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i16x32(a);
         self.combine_u32x8(
-            self.reinterpret_u32_i8x32(a0),
-            self.reinterpret_u32_i8x32(a1),
+            self.reinterpret_u32_i16x16(a0),
+            self.reinterpret_u32_i16x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_u8x64(self, val: u8) -> u8x64<Self> {
-        let half = self.splat_u8x32(val);
-        self.combine_u8x32(half, half)
+    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
+        let half = self.splat_u16x16(val);
+        self.combine_u16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
-        u8x64 {
+    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u8; 64usize]>(&a.val.0)
+    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [u8; 64usize]>(&a.val.0)
+    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [u16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [u8; 64usize]>(&mut a.val.0)
+    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [u16; 32usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
+    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        u8x64 {
+    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
+        u16x32 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        if SHIFT >= 64usize {
+    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        if SHIFT >= 32usize {
             return b;
         }
         let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_u8x64(a).val.0,
-            self.cvt_to_bytes_u8x64(b).val.0,
-            SHIFT,
+            self.cvt_to_bytes_u16x32(a).val.0,
+            self.cvt_to_bytes_u16x32(b).val.0,
+            SHIFT * 2usize,
         );
-        self.cvt_from_bytes_u8x64(u8x64 {
+        self.cvt_from_bytes_u16x32(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u8x64<const SHIFT: usize>(
+    fn slide_within_blocks_u16x32<const SHIFT: usize>(
         self,
-        a: u8x64<Self>,
-        b: u8x64<Self>,
-    ) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(
-            self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
+        a: u16x32<Self>,
+        b: u16x32<Self>,
+    ) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
+    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
+    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
+    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
+    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
+    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
+    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
+    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
     }
     #[inline(always)]
-    fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
+    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
+    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
+    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
+    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
+    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
+    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
+    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
+    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
+    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, _) = self.split_u8x64(a);
-        let (b0, _) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
+    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, _) = self.split_u16x32(a);
+        let (b0, _) = self.split_u16x32(b);
+        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (_, a1) = self.split_u8x64(a);
-        let (_, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
+    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (_, a1) = self.split_u16x32(a);
+        let (_, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
+    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
+    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(
+            self.unzip_high_u16x16(a0, a1),
+            self.unzip_high_u16x16(b0, b1),
+        )
     }
     #[inline(always)]
-    fn interleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_lo = self.zip_low_u8x32(a0, b0);
-        let lo_hi = self.zip_high_u8x32(a0, b0);
-        let hi_lo = self.zip_low_u8x32(a1, b1);
-        let hi_hi = self.zip_high_u8x32(a1, b1);
+    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let lo_lo = self.zip_low_u16x16(a0, b0);
+        let lo_hi = self.zip_high_u16x16(a0, b0);
+        let hi_lo = self.zip_low_u16x16(a1, b1);
+        let hi_hi = self.zip_high_u16x16(a1, b1);
         (
-            self.combine_u8x32(lo_lo, lo_hi),
-            self.combine_u8x32(hi_lo, hi_hi),
+            self.combine_u16x16(lo_lo, lo_hi),
+            self.combine_u16x16(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> (u8x64<Self>, u8x64<Self>) {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let lo_even = self.unzip_low_u8x32(a0, a1);
-        let lo_odd = self.unzip_high_u8x32(a0, a1);
-        let hi_even = self.unzip_low_u8x32(b0, b1);
-        let hi_odd = self.unzip_high_u8x32(b0, b1);
+    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let lo_even = self.unzip_low_u16x16(a0, a1);
+        let lo_odd = self.unzip_high_u16x16(a0, a1);
+        let hi_even = self.unzip_low_u16x16(b0, b1);
+        let hi_odd = self.unzip_high_u16x16(b0, b1);
         (
-            self.combine_u8x32(lo_even, hi_even),
-            self.combine_u8x32(lo_odd, hi_odd),
+            self.combine_u16x16(lo_even, hi_even),
+            self.combine_u16x16(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        let (c0, c1) = self.split_u8x64(c);
-        self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
+    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        let (c0, c1) = self.split_u16x32(c);
+        self.combine_u16x16(
+            self.select_u16x16(a0, b0, c0),
+            self.select_u16x16(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
+    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u8x64(a);
-        let (b0, b1) = self.split_u8x64(b);
-        self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
+    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        let (b0, b1) = self.split_u16x32(b);
+        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
     }
     #[inline(always)]
-    fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
+    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
         (
-            u8x32 {
+            u16x16 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            u8x32 {
+            u16x16 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
-        let (chunks, []) = src.as_chunks::<16usize>() else {
+    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
+        let (chunks, []) = src.as_chunks::<8usize>() else {
             unreachable!()
         };
-        let v0: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[0]);
-        let v1: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[1]);
-        let v2: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[2]);
-        let v3: v128 = crate::transmute::checked_transmute_copy::<[u8; 16usize], v128>(&chunks[3]);
-        let v01_lower =
-            u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v0, v1);
-        let v23_lower =
-            u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v2, v3);
-        let v01_upper =
-            u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v0, v1);
-        let v23_upper =
-            u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v2, v3);
-        let out0 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(
-            v01_lower, v23_lower,
-        );
-        let out1 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
-            v01_lower, v23_lower,
-        );
-        let out2 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(
-            v01_upper, v23_upper,
-        );
-        let out3 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>(
-            v01_upper, v23_upper,
-        );
-        let combined_lower = self.combine_u8x16(out0.simd_into(self), out1.simd_into(self));
-        let combined_upper = self.combine_u8x16(out2.simd_into(self), out3.simd_into(self));
-        self.combine_u8x32(combined_lower, combined_upper)
+        let v0: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[0]);
+        let v1: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[1]);
+        let v2: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[2]);
+        let v3: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[3]);
+        let v01_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v0, v1);
+        let v23_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v2, v3);
+        let v01_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v0, v1);
+        let v23_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v2, v3);
+        let out0 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_lower, v23_lower);
+        let out1 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_lower, v23_lower);
+        let out2 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_upper, v23_upper);
+        let out3 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_upper, v23_upper);
+        let combined_lower = self.combine_u16x8(out0.simd_into(self), out1.simd_into(self));
+        let combined_upper = self.combine_u16x8(out2.simd_into(self), out3.simd_into(self));
+        self.combine_u16x16(combined_lower, combined_upper)
     }
     #[inline(always)]
-    fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
-        let (lower, upper) = self.split_u8x64(a);
-        let (v0_vec, v1_vec) = self.split_u8x32(lower);
-        let (v2_vec, v3_vec) = self.split_u8x32(upper);
+    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+        let (lower, upper) = self.split_u16x32(a);
+        let (v0_vec, v1_vec) = self.split_u16x16(lower);
+        let (v2_vec, v3_vec) = self.split_u16x16(upper);
         let v0: v128 = v0_vec.into();
         let v1: v128 = v1_vec.into();
         let v2: v128 = v2_vec.into();
         let v3: v128 = v3_vec.into();
-        let v02_lower =
-            u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v0, v2);
-        let v13_lower =
-            u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v1, v3);
-        let v02_upper =
-            u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v0, v2);
-        let v13_upper =
-            u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v1, v3);
-        let out0 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
-            v02_lower, v13_lower,
-        );
-        let out1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
-            v02_lower, v13_lower,
-        );
-        let out2 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(
-            v02_upper, v13_upper,
-        );
-        let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
-            v02_upper, v13_upper,
-        );
-        let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+        let v02_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v0, v2);
+        let v13_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v1, v3);
+        let v02_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v0, v2);
+        let v13_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v1, v3);
+        let out0 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_lower, v13_lower);
+        let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower);
+        let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper);
+        let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper);
+        let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
             unreachable!()
         };
-        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out0, &mut chunks[0]);
-        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out1, &mut chunks[1]);
-        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out2, &mut chunks[2]);
-        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out3, &mut chunks[3]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
-    fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u8x64(a);
+    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u16x32(a);
+        self.combine_u8x32(
+            self.reinterpret_u8_u16x16(a0),
+            self.reinterpret_u8_u16x16(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u16x32(a);
         self.combine_u32x8(
-            self.reinterpret_u32_u8x32(a0),
-            self.reinterpret_u32_u8x32(a1),
+            self.reinterpret_u32_u16x16(a0),
+            self.reinterpret_u32_u16x16(a1),
         )
     }
     #[inline(always)]
-    fn splat_mask8x64(self, val: bool) -> mask8x64<Self> {
-        let half = self.splat_mask8x32(val);
-        self.combine_mask8x32(half, half)
+    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
+        let half = self.splat_mask16x16(val);
+        self.combine_mask16x16(half, half)
     }
     #[inline(always)]
-    fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
-        mask8x64 {
+    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
+        mask16x32 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i8; 64usize]>(&a.val.0)
+    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
-        let lo = self.from_bitmask_mask8x32(bits);
-        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
-        self.combine_mask8x32(lo, hi)
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
     }
     #[inline(always)]
-    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
-        let (lo, hi) = self.split_mask8x64(a);
-        let lo = self.to_bitmask_mask8x32(lo);
-        let hi = self.to_bitmask_mask8x32(hi);
-        lo | (hi << 32usize)
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
     }
     #[inline(always)]
-    fn set_mask8x64(self, a: &mut mask8x64<Self>, index: usize, value: bool) -> () {
+    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 64usize,
+            index < 32usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            64usize
+            32usize
         );
-        let mut lanes = self.as_array_mask8x64(*a);
+        let mut lanes = self.as_array_mask16x32(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask8x64(lanes);
+        *a = self.load_array_mask16x32(lanes);
     }
     #[inline(always)]
-    fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
+    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
+    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
+    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
     }
     #[inline(always)]
-    fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
+    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
     }
     #[inline(always)]
-    fn select_mask8x64(
+    fn select_mask16x32(
         self,
-        a: mask8x64<Self>,
-        b: mask8x64<Self>,
-        c: mask8x64<Self>,
-    ) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        let (c0, c1) = self.split_mask8x64(c);
-        self.combine_mask8x32(
-            self.select_mask8x32(a0, b0, c0),
-            self.select_mask8x32(a1, b1, c1),
+        a: mask16x32<Self>,
+        b: mask16x32<Self>,
+        c: mask16x32<Self>,
+    ) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        let (c0, c1) = self.split_mask16x32(c);
+        self.combine_mask16x16(
+            self.select_mask16x16(a0, b0, c0),
+            self.select_mask16x16(a1, b1, c1),
         )
     }
     #[inline(always)]
-    fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
-        let (a0, a1) = self.split_mask8x64(a);
-        let (b0, b1) = self.split_mask8x64(b);
-        self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
+    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
+        let (a0, a1) = self.split_mask16x32(a);
+        let (b0, b1) = self.split_mask16x32(b);
+        self.combine_mask16x16(
+            self.simd_eq_mask16x16(a0, b0),
+            self.simd_eq_mask16x16(a1, b1),
+        )
     }
     #[inline(always)]
-    fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
+    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
     }
     #[inline(always)]
-    fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
+    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
     }
     #[inline(always)]
-    fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
+    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
     }
     #[inline(always)]
-    fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
-        let (a0, a1) = self.split_mask8x64(a);
-        self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
+    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
+        let (a0, a1) = self.split_mask16x32(a);
+        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
     }
     #[inline(always)]
-    fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
+    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
         (
-            mask8x32 {
+            mask16x16 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            mask8x32 {
+            mask16x16 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i16x32(self, val: i16) -> i16x32<Self> {
-        let half = self.splat_i16x16(val);
-        self.combine_i16x16(half, half)
+    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
+        let half = self.splat_i32x8(val);
+        self.combine_i32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
-        i16x32 {
+    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0)
+    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [i16; 32usize]>(&a.val.0)
+    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [i16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [i32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
+    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
-        i16x32 {
+    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
+        i32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
-        }
-        let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_i16x32(a).val.0,
-            self.cvt_to_bytes_i16x32(b).val.0,
-            SHIFT * 2usize,
+        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_i32x16(a).val.0,
+            self.cvt_to_bytes_i32x16(b).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_i16x32(u8x64 {
+        self.cvt_from_bytes_i32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i16x32<const SHIFT: usize>(
+    fn slide_within_blocks_i32x16<const SHIFT: usize>(
         self,
-        a: i16x32<Self>,
-        b: i16x32<Self>,
-    ) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
+        a: i32x16<Self>,
+        b: i32x16<Self>,
+    ) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(
+            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
+    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
+    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
+    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
+    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
+    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
+    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
+    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
     }
     #[inline(always)]
-    fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
+    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
+    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
+    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
+    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
+    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
+    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
+    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
+    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
+    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, _) = self.split_i16x32(a);
-        let (b0, _) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
+    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, _) = self.split_i32x16(a);
+        let (b0, _) = self.split_i32x16(b);
+        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (_, a1) = self.split_i16x32(a);
-        let (_, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
+    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (_, a1) = self.split_i32x16(a);
+        let (_, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
+    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(
-            self.unzip_high_i16x16(a0, a1),
-            self.unzip_high_i16x16(b0, b1),
-        )
+    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_lo = self.zip_low_i16x16(a0, b0);
-        let lo_hi = self.zip_high_i16x16(a0, b0);
-        let hi_lo = self.zip_low_i16x16(a1, b1);
-        let hi_hi = self.zip_high_i16x16(a1, b1);
+    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let lo_lo = self.zip_low_i32x8(a0, b0);
+        let lo_hi = self.zip_high_i32x8(a0, b0);
+        let hi_lo = self.zip_low_i32x8(a1, b1);
+        let hi_hi = self.zip_high_i32x8(a1, b1);
         (
-            self.combine_i16x16(lo_lo, lo_hi),
-            self.combine_i16x16(hi_lo, hi_hi),
+            self.combine_i32x8(lo_lo, lo_hi),
+            self.combine_i32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> (i16x32<Self>, i16x32<Self>) {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let lo_even = self.unzip_low_i16x16(a0, a1);
-        let lo_odd = self.unzip_high_i16x16(a0, a1);
-        let hi_even = self.unzip_low_i16x16(b0, b1);
-        let hi_odd = self.unzip_high_i16x16(b0, b1);
+    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let lo_even = self.unzip_low_i32x8(a0, a1);
+        let lo_odd = self.unzip_high_i32x8(a0, a1);
+        let hi_even = self.unzip_low_i32x8(b0, b1);
+        let hi_odd = self.unzip_high_i32x8(b0, b1);
         (
-            self.combine_i16x16(lo_even, hi_even),
-            self.combine_i16x16(lo_odd, hi_odd),
+            self.combine_i32x8(lo_even, hi_even),
+            self.combine_i32x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        let (c0, c1) = self.split_i16x32(c);
-        self.combine_i16x16(
-            self.select_i16x16(a0, b0, c0),
-            self.select_i16x16(a1, b1, c1),
-        )
+    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        let (c0, c1) = self.split_i32x16(c);
+        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
+    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        let (b0, b1) = self.split_i16x32(b);
-        self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
+    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        let (b0, b1) = self.split_i32x16(b);
+        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
+    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
         (
-            i16x16 {
+            i32x8 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            i16x16 {
+            i32x8 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
+    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_i16x16(a0),
-            self.reinterpret_u8_i16x16(a1),
+    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i32x8(a0),
+            self.reinterpret_u32_i32x8(a1),
         )
     }
     #[inline(always)]
-    fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i16x32(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_i16x16(a0),
-            self.reinterpret_u32_i16x16(a1),
-        )
+    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_i32x16(a);
+        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
     }
     #[inline(always)]
-    fn splat_u16x32(self, val: u16) -> u16x32<Self> {
-        let half = self.splat_u16x16(val);
-        self.combine_u16x16(half, half)
+    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
+        let half = self.splat_u32x8(val);
+        self.combine_u32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
-        u16x32 {
+    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u16; 32usize]>(&a.val.0)
+    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [u16; 32usize]>(&a.val.0)
+    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [u32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [u16; 32usize]>(&mut a.val.0)
+    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [u32; 16usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
+    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
-        u16x32 {
+    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
+        u32x16 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        if SHIFT >= 32usize {
+    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        if SHIFT >= 16usize {
             return b;
         }
         let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_u16x32(a).val.0,
-            self.cvt_to_bytes_u16x32(b).val.0,
-            SHIFT * 2usize,
+            self.cvt_to_bytes_u32x16(a).val.0,
+            self.cvt_to_bytes_u32x16(b).val.0,
+            SHIFT * 4usize,
         );
-        self.cvt_from_bytes_u16x32(u8x64 {
+        self.cvt_from_bytes_u32x16(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_u16x32<const SHIFT: usize>(
+    fn slide_within_blocks_u32x16<const SHIFT: usize>(
         self,
-        a: u16x32<Self>,
-        b: u16x32<Self>,
-    ) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
+        a: u32x16<Self>,
+        b: u32x16<Self>,
+    ) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(
+            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
+    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
+    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
+    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
+    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
+    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
+    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
+    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
     }
     #[inline(always)]
-    fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
+    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
     }
     #[inline(always)]
-    fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
+    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
+    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
     }
     #[inline(always)]
-    fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
+    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
+    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
+    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
+    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
+    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
+    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, _) = self.split_u16x32(a);
-        let (b0, _) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
+    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, _) = self.split_u32x16(a);
+        let (b0, _) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
     }
     #[inline(always)]
-    fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (_, a1) = self.split_u16x32(a);
-        let (_, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
+    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (_, a1) = self.split_u32x16(a);
+        let (_, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
+    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
     }
     #[inline(always)]
-    fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(
-            self.unzip_high_u16x16(a0, a1),
-            self.unzip_high_u16x16(b0, b1),
-        )
+    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
     }
     #[inline(always)]
-    fn interleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_lo = self.zip_low_u16x16(a0, b0);
-        let lo_hi = self.zip_high_u16x16(a0, b0);
-        let hi_lo = self.zip_low_u16x16(a1, b1);
-        let hi_hi = self.zip_high_u16x16(a1, b1);
+    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_lo = self.zip_low_u32x8(a0, b0);
+        let lo_hi = self.zip_high_u32x8(a0, b0);
+        let hi_lo = self.zip_low_u32x8(a1, b1);
+        let hi_hi = self.zip_high_u32x8(a1, b1);
         (
-            self.combine_u16x16(lo_lo, lo_hi),
-            self.combine_u16x16(hi_lo, hi_hi),
+            self.combine_u32x8(lo_lo, lo_hi),
+            self.combine_u32x8(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn deinterleave_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> (u16x32<Self>, u16x32<Self>) {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let lo_even = self.unzip_low_u16x16(a0, a1);
-        let lo_odd = self.unzip_high_u16x16(a0, a1);
-        let hi_even = self.unzip_low_u16x16(b0, b1);
-        let hi_odd = self.unzip_high_u16x16(b0, b1);
+    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let lo_even = self.unzip_low_u32x8(a0, a1);
+        let lo_odd = self.unzip_high_u32x8(a0, a1);
+        let hi_even = self.unzip_low_u32x8(b0, b1);
+        let hi_odd = self.unzip_high_u32x8(b0, b1);
         (
-            self.combine_u16x16(lo_even, hi_even),
-            self.combine_u16x16(lo_odd, hi_odd),
+            self.combine_u32x8(lo_even, hi_even),
+            self.combine_u32x8(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        let (c0, c1) = self.split_u16x32(c);
-        self.combine_u16x16(
-            self.select_u16x16(a0, b0, c0),
-            self.select_u16x16(a1, b1, c1),
-        )
+    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        let (c0, c1) = self.split_u32x16(c);
+        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
     }
     #[inline(always)]
-    fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
+    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        let (b0, b1) = self.split_u16x32(b);
-        self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
+    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        let (b0, b1) = self.split_u32x16(b);
+        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
     }
     #[inline(always)]
-    fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
+    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
         (
-            u16x16 {
+            u32x8 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            u16x16 {
+            u32x8 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
-        let (chunks, []) = src.as_chunks::<8usize>() else {
+    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
+        let (chunks, []) = src.as_chunks::<4usize>() else {
             unreachable!()
         };
-        let v0: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[0]);
-        let v1: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[1]);
-        let v2: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[2]);
-        let v3: v128 = crate::transmute::checked_transmute_copy::<[u16; 8usize], v128>(&chunks[3]);
-        let v01_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v0, v1);
-        let v23_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v2, v3);
-        let v01_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v0, v1);
-        let v23_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v2, v3);
-        let out0 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_lower, v23_lower);
-        let out1 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_lower, v23_lower);
-        let out2 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_upper, v23_upper);
-        let out3 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_upper, v23_upper);
-        let combined_lower = self.combine_u16x8(out0.simd_into(self), out1.simd_into(self));
-        let combined_upper = self.combine_u16x8(out2.simd_into(self), out3.simd_into(self));
-        self.combine_u16x16(combined_lower, combined_upper)
+        let v0: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[0]);
+        let v1: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[1]);
+        let v2: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[2]);
+        let v3: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[3]);
+        let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1);
+        let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3);
+        let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1);
+        let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3);
+        let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower);
+        let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower);
+        let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper);
+        let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper);
+        let combined_lower = self.combine_u32x4(out0.simd_into(self), out1.simd_into(self));
+        let combined_upper = self.combine_u32x4(out2.simd_into(self), out3.simd_into(self));
+        self.combine_u32x8(combined_lower, combined_upper)
     }
     #[inline(always)]
-    fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
-        let (lower, upper) = self.split_u16x32(a);
-        let (v0_vec, v1_vec) = self.split_u16x16(lower);
-        let (v2_vec, v3_vec) = self.split_u16x16(upper);
+    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
+        let (lower, upper) = self.split_u32x16(a);
+        let (v0_vec, v1_vec) = self.split_u32x8(lower);
+        let (v2_vec, v3_vec) = self.split_u32x8(upper);
         let v0: v128 = v0_vec.into();
         let v1: v128 = v1_vec.into();
         let v2: v128 = v2_vec.into();
         let v3: v128 = v3_vec.into();
-        let v02_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v0, v2);
-        let v13_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v1, v3);
-        let v02_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v0, v2);
-        let v13_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v1, v3);
-        let out0 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_lower, v13_lower);
-        let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower);
-        let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper);
-        let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper);
-        let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+        let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2);
+        let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3);
+        let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2);
+        let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3);
+        let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower);
+        let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
+        let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
+        let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
+        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
             unreachable!()
         };
-        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out0, &mut chunks[0]);
-        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out1, &mut chunks[1]);
-        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out2, &mut chunks[2]);
-        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out3, &mut chunks[3]);
-    }
-    #[inline(always)]
-    fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
-    fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u8x32(
-            self.reinterpret_u8_u16x16(a0),
-            self.reinterpret_u8_u16x16(a1),
-        )
+    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
     }
     #[inline(always)]
-    fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u16x32(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_u16x16(a0),
-            self.reinterpret_u32_u16x16(a1),
-        )
+    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_u32x16(a);
+        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
     }
     #[inline(always)]
-    fn splat_mask16x32(self, val: bool) -> mask16x32<Self> {
-        let half = self.splat_mask16x16(val);
-        self.combine_mask16x16(half, half)
+    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
+        let half = self.splat_mask32x8(val);
+        self.combine_mask32x8(half, half)
     }
     #[inline(always)]
-    fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
-        mask16x32 {
+    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
+        mask32x16 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i16; 32usize]>(&a.val.0)
+    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
-        let lo = self.from_bitmask_mask16x16(bits);
-        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
-        self.combine_mask16x16(lo, hi)
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
     }
     #[inline(always)]
-    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x32(a);
-        let lo = self.to_bitmask_mask16x16(lo);
-        let hi = self.to_bitmask_mask16x16(hi);
-        lo | (hi << 16usize)
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
     }
     #[inline(always)]
-    fn set_mask16x32(self, a: &mut mask16x32<Self>, index: usize, value: bool) -> () {
+    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
         assert!(
-            index < 32usize,
+            index < 16usize,
             "mask lane index {index} is out of bounds for {} lanes",
-            32usize
+            16usize
         );
-        let mut lanes = self.as_array_mask16x32(*a);
+        let mut lanes = self.as_array_mask32x16(*a);
         lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask16x32(lanes);
+        *a = self.load_array_mask32x16(lanes);
     }
     #[inline(always)]
-    fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
+    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
+    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
+    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
+    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
     }
     #[inline(always)]
-    fn select_mask16x32(
+    fn select_mask32x16(
         self,
-        a: mask16x32<Self>,
-        b: mask16x32<Self>,
-        c: mask16x32<Self>,
-    ) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        let (c0, c1) = self.split_mask16x32(c);
-        self.combine_mask16x16(
-            self.select_mask16x16(a0, b0, c0),
-            self.select_mask16x16(a1, b1, c1),
+        a: mask32x16<Self>,
+        b: mask32x16<Self>,
+        c: mask32x16<Self>,
+    ) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        let (c0, c1) = self.split_mask32x16(c);
+        self.combine_mask32x8(
+            self.select_mask32x8(a0, b0, c0),
+            self.select_mask32x8(a1, b1, c1),
         )
     }
-    #[inline(always)]
-    fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
-        let (a0, a1) = self.split_mask16x32(a);
-        let (b0, b1) = self.split_mask16x32(b);
-        self.combine_mask16x16(
-            self.simd_eq_mask16x16(a0, b0),
-            self.simd_eq_mask16x16(a1, b1),
-        )
+    #[inline(always)]
+    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
+        let (a0, a1) = self.split_mask32x16(a);
+        let (b0, b1) = self.split_mask32x16(b);
+        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
     }
     #[inline(always)]
-    fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
+    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
+    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
     }
     #[inline(always)]
-    fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
+    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
-        let (a0, a1) = self.split_mask16x32(a);
-        self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
+    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
+        let (a0, a1) = self.split_mask32x16(a);
+        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
     }
     #[inline(always)]
-    fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
+    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
         (
-            mask16x16 {
+            mask32x8 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            mask16x16 {
+            mask32x8 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_i32x16(self, val: i32) -> i32x16<Self> {
-        let half = self.splat_i32x8(val);
-        self.combine_i32x8(half, half)
+    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
+        let half = self.splat_f64x4(val);
+        self.combine_f64x4(half, half)
     }
     #[inline(always)]
-    fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
-        i32x16 {
+    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0)
+    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [i32; 16usize]>(&a.val.0)
+    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [f64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [i32; 16usize]>(&mut a.val.0)
+    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [f64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
+    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
-        i32x16 {
+    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
+        f64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        if SHIFT >= 16usize {
+    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_i32x16(a).val.0,
-            self.cvt_to_bytes_i32x16(b).val.0,
-            SHIFT * 4usize,
+            self.cvt_to_bytes_f64x8(a).val.0,
+            self.cvt_to_bytes_f64x8(b).val.0,
+            SHIFT * 8usize,
         );
-        self.cvt_from_bytes_i32x16(u8x64 {
+        self.cvt_from_bytes_f64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_i32x16<const SHIFT: usize>(
+    fn slide_within_blocks_f64x8<const SHIFT: usize>(
         self,
-        a: i32x16<Self>,
-        b: i32x16<Self>,
-    ) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(
-            self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
+        a: f64x8<Self>,
+        b: f64x8<Self>,
+    ) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
-    }
-    #[inline(always)]
-    fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
-    }
-    #[inline(always)]
-    fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, _) = self.split_i32x16(a);
-        let (b0, _) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
-    }
-    #[inline(always)]
-    fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (_, a1) = self.split_i32x16(a);
-        let (_, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
-    }
-    #[inline(always)]
-    fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
+    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
     }
     #[inline(always)]
-    fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
+    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
     }
     #[inline(always)]
-    fn interleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_lo = self.zip_low_i32x8(a0, b0);
-        let lo_hi = self.zip_high_i32x8(a0, b0);
-        let hi_lo = self.zip_low_i32x8(a1, b1);
-        let hi_hi = self.zip_high_i32x8(a1, b1);
-        (
-            self.combine_i32x8(lo_lo, lo_hi),
-            self.combine_i32x8(hi_lo, hi_hi),
-        )
+    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
     }
     #[inline(always)]
-    fn deinterleave_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> (i32x16<Self>, i32x16<Self>) {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let lo_even = self.unzip_low_i32x8(a0, a1);
-        let lo_odd = self.unzip_high_i32x8(a0, a1);
-        let hi_even = self.unzip_low_i32x8(b0, b1);
-        let hi_odd = self.unzip_high_i32x8(b0, b1);
-        (
-            self.combine_i32x8(lo_even, hi_even),
-            self.combine_i32x8(lo_odd, hi_odd),
+    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.approximate_recip_f64x4(a0),
+            self.approximate_recip_f64x4(a1),
         )
     }
     #[inline(always)]
-    fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        let (c0, c1) = self.split_i32x16(c);
-        self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
+    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
+    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        let (b0, b1) = self.split_i32x16(b);
-        self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
+    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
-        (
-            i32x8 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            i32x8 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
+    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
+    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
+    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_u32x8(
-            self.reinterpret_u32_i32x8(a0),
-            self.reinterpret_u32_i32x8(a1),
-        )
+    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_i32x16(a);
-        self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
+    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn splat_u32x16(self, val: u32) -> u32x16<Self> {
-        let half = self.splat_u32x8(val);
-        self.combine_u32x8(half, half)
+    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
-        u32x16 {
-            val: crate::transmute::checked_transmute_copy(val),
-            simd: self,
-        }
+    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, _) = self.split_f64x8(a);
+        let (b0, _) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
     }
     #[inline(always)]
-    fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u32; 16usize]>(&a.val.0)
+    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (_, a1) = self.split_f64x8(a);
+        let (_, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [u32; 16usize]>(&a.val.0)
+    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
     }
     #[inline(always)]
-    fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [u32; 16usize]>(&mut a.val.0)
+    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
     }
     #[inline(always)]
-    fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        crate::transmute::checked_transmute_store(a.val.0, dest);
+    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_lo = self.zip_low_f64x4(a0, b0);
+        let lo_hi = self.zip_high_f64x4(a0, b0);
+        let hi_lo = self.zip_low_f64x4(a1, b1);
+        let hi_hi = self.zip_high_f64x4(a1, b1);
+        (
+            self.combine_f64x4(lo_lo, lo_hi),
+            self.combine_f64x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
-        u32x16 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let lo_even = self.unzip_low_f64x4(a0, a1);
+        let lo_odd = self.unzip_high_f64x4(a0, a1);
+        let hi_even = self.unzip_low_f64x4(b0, b1);
+        let hi_odd = self.unzip_high_f64x4(b0, b1);
+        (
+            self.combine_f64x4(lo_even, hi_even),
+            self.combine_f64x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        u8x64 {
-            val: crate::transmute::checked_transmute_copy(&a.val),
-            simd: self,
-        }
+    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        if SHIFT >= 16usize {
-            return b;
-        }
-        let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_u32x16(a).val.0,
-            self.cvt_to_bytes_u32x16(b).val.0,
-            SHIFT * 4usize,
-        );
-        self.cvt_from_bytes_u32x16(u8x64 {
-            val: crate::support::Aligned512(result),
-            simd: self,
-        })
+    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
     }
     #[inline(always)]
-    fn slide_within_blocks_u32x16<const SHIFT: usize>(
-        self,
-        a: u32x16<Self>,
-        b: u32x16<Self>,
-    ) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(
-            self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
-            self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
+    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.max_precise_f64x4(a0, b0),
+            self.max_precise_f64x4(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
+    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        self.combine_f64x4(
+            self.min_precise_f64x4(a0, b0),
+            self.min_precise_f64x4(a1, b1),
+        )
     }
     #[inline(always)]
-    fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
+    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_add_f64x4(a0, b0, c0),
+            self.mul_add_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
+    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(
+            self.mul_sub_f64x4(a0, b0, c0),
+            self.mul_sub_f64x4(a1, b1, c1),
+        )
     }
     #[inline(always)]
-    fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
+    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
     }
     #[inline(always)]
-    fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
+    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
     }
     #[inline(always)]
-    fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
+    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(
+            self.round_ties_even_f64x4(a0),
+            self.round_ties_even_f64x4(a1),
+        )
     }
     #[inline(always)]
-    fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
+    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
     }
     #[inline(always)]
-    fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
+    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
     }
     #[inline(always)]
-    fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
+    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_f64x8(b);
+        let (c0, c1) = self.split_f64x8(c);
+        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
+    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+        (
+            f64x4 {
+                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
+                simd: self,
+            },
+            f64x4 {
+                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
+                simd: self,
+            },
+        )
     }
     #[inline(always)]
-    fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
+    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f32x8(
+            self.reinterpret_f32_f64x4(a0),
+            self.reinterpret_f32_f64x4(a1),
+        )
     }
     #[inline(always)]
-    fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
+    fn splat_i64x8(self, val: i64) -> i64x8<Self> {
+        let half = self.splat_i64x4(val);
+        self.combine_i64x4(half, half)
     }
     #[inline(always)]
-    fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
+    fn load_array_i64x8(self, val: [i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(&val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
+    fn load_array_ref_i64x8(self, val: &[i64; 8usize]) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
+    fn as_array_i64x8(self, a: i64x8<Self>) -> [i64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
+    fn as_array_ref_i64x8(self, a: &i64x8<Self>) -> &[i64; 8usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [i64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, _) = self.split_u32x16(a);
-        let (b0, _) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
+    fn as_array_mut_i64x8(self, a: &mut i64x8<Self>) -> &mut [i64; 8usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [i64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (_, a1) = self.split_u32x16(a);
-        let (_, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
+    fn store_array_i64x8(self, a: i64x8<Self>, dest: &mut [i64; 8usize]) -> () {
+        crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
+    fn cvt_from_bytes_i64x8(self, a: u8x64<Self>) -> i64x8<Self> {
+        i64x8 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
+    fn cvt_to_bytes_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        u8x64 {
+            val: crate::transmute::checked_transmute_copy(&a.val),
+            simd: self,
+        }
     }
     #[inline(always)]
-    fn interleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_lo = self.zip_low_u32x8(a0, b0);
-        let lo_hi = self.zip_high_u32x8(a0, b0);
-        let hi_lo = self.zip_low_u32x8(a1, b1);
-        let hi_hi = self.zip_high_u32x8(a1, b1);
-        (
-            self.combine_u32x8(lo_lo, lo_hi),
-            self.combine_u32x8(hi_lo, hi_hi),
-        )
+    fn slide_i64x8<const SHIFT: usize>(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        if SHIFT >= 8usize {
+            return b;
+        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_i64x8(a).val.0,
+            self.cvt_to_bytes_i64x8(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_i64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
-    fn deinterleave_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> (u32x16<Self>, u32x16<Self>) {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let lo_even = self.unzip_low_u32x8(a0, a1);
-        let lo_odd = self.unzip_high_u32x8(a0, a1);
-        let hi_even = self.unzip_low_u32x8(b0, b1);
-        let hi_odd = self.unzip_high_u32x8(b0, b1);
-        (
-            self.combine_u32x8(lo_even, hi_even),
-            self.combine_u32x8(lo_odd, hi_odd),
+    fn slide_within_blocks_i64x8<const SHIFT: usize>(
+        self,
+        a: i64x8<Self>,
+        b: i64x8<Self>,
+    ) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(
+            self.slide_within_blocks_i64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_i64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        let (c0, c1) = self.split_u32x16(c);
-        self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
+    fn add_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.add_i64x4(a0, b0), self.add_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
+    fn sub_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.sub_i64x4(a0, b0), self.sub_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        let (b0, b1) = self.split_u32x16(b);
-        self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
+    fn mul_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.mul_i64x4(a0, b0), self.mul_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
-        (
-            u32x8 {
-                val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
-                simd: self,
-            },
-            u32x8 {
-                val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
-                simd: self,
-            },
-        )
+    fn and_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.and_i64x4(a0, b0), self.and_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        let (chunks, []) = src.as_chunks::<4usize>() else {
-            unreachable!()
-        };
-        let v0: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[0]);
-        let v1: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[1]);
-        let v2: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[2]);
-        let v3: v128 = crate::transmute::checked_transmute_copy::<[u32; 4usize], v128>(&chunks[3]);
-        let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1);
-        let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3);
-        let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1);
-        let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3);
-        let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower);
-        let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower);
-        let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper);
-        let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper);
-        let combined_lower = self.combine_u32x4(out0.simd_into(self), out1.simd_into(self));
-        let combined_upper = self.combine_u32x4(out2.simd_into(self), out3.simd_into(self));
-        self.combine_u32x8(combined_lower, combined_upper)
+    fn or_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.or_i64x4(a0, b0), self.or_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        let (lower, upper) = self.split_u32x16(a);
-        let (v0_vec, v1_vec) = self.split_u32x8(lower);
-        let (v2_vec, v3_vec) = self.split_u32x8(upper);
-        let v0: v128 = v0_vec.into();
-        let v1: v128 = v1_vec.into();
-        let v2: v128 = v2_vec.into();
-        let v3: v128 = v3_vec.into();
-        let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2);
-        let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3);
-        let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2);
-        let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3);
-        let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower);
-        let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
-        let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
-        let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
-        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
-            unreachable!()
-        };
-        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out0, &mut chunks[0]);
-        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out1, &mut chunks[1]);
-        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out2, &mut chunks[2]);
-        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out3, &mut chunks[3]);
+    fn xor_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.xor_i64x4(a0, b0), self.xor_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
+    fn not_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.not_i64x4(a0), self.not_i64x4(a1))
     }
     #[inline(always)]
-    fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_u32x16(a);
-        self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
+    fn shl_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shl_i64x4(a0, shift), self.shl_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn splat_mask32x16(self, val: bool) -> mask32x16<Self> {
-        let half = self.splat_mask32x8(val);
-        self.combine_mask32x8(half, half)
+    fn shlv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shlv_i64x4(a0, b0), self.shlv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
-        mask32x16 {
-            val: crate::transmute::checked_transmute_copy(&val),
-            simd: self,
-        }
+    fn shr_i64x8(self, a: i64x8<Self>, shift: u32) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.shr_i64x4(a0, shift), self.shr_i64x4(a1, shift))
     }
     #[inline(always)]
-    fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [i32; 16usize]>(&a.val.0)
+    fn shrv_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.shrv_i64x4(a0, b0), self.shrv_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
-        let lo = self.from_bitmask_mask32x8(bits);
-        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
-        self.combine_mask32x8(lo, hi)
+    fn simd_eq_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_eq_i64x4(a0, b0), self.simd_eq_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask32x16(a);
-        let lo = self.to_bitmask_mask32x8(lo);
-        let hi = self.to_bitmask_mask32x8(hi);
-        lo | (hi << 8usize)
+    fn simd_lt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_lt_i64x4(a0, b0), self.simd_lt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn set_mask32x16(self, a: &mut mask32x16<Self>, index: usize, value: bool) -> () {
-        assert!(
-            index < 16usize,
-            "mask lane index {index} is out of bounds for {} lanes",
-            16usize
-        );
-        let mut lanes = self.as_array_mask32x16(*a);
-        lanes[index] = if value { !0 } else { 0 };
-        *a = self.load_array_mask32x16(lanes);
+    fn simd_le_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_le_i64x4(a0, b0), self.simd_le_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
+    fn simd_ge_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_ge_i64x4(a0, b0), self.simd_ge_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
+    fn simd_gt_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_mask64x4(self.simd_gt_i64x4(a0, b0), self.simd_gt_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
+    fn zip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, _) = self.split_i64x8(a);
+        let (b0, _) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a0, b0), self.zip_high_i64x4(a0, b0))
     }
     #[inline(always)]
-    fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
+    fn zip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (_, a1) = self.split_i64x8(a);
+        let (_, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.zip_low_i64x4(a1, b1), self.zip_high_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_mask32x16(
-        self,
-        a: mask32x16<Self>,
-        b: mask32x16<Self>,
-        c: mask32x16<Self>,
-    ) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        let (c0, c1) = self.split_mask32x16(c);
-        self.combine_mask32x8(
-            self.select_mask32x8(a0, b0, c0),
-            self.select_mask32x8(a1, b1, c1),
-        )
+    fn unzip_low_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_low_i64x4(a0, a1), self.unzip_low_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
-        let (a0, a1) = self.split_mask32x16(a);
-        let (b0, b1) = self.split_mask32x16(b);
-        self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
+    fn unzip_high_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.unzip_high_i64x4(a0, a1), self.unzip_high_i64x4(b0, b1))
     }
     #[inline(always)]
-    fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
+    fn interleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_lo = self.zip_low_i64x4(a0, b0);
+        let lo_hi = self.zip_high_i64x4(a0, b0);
+        let hi_lo = self.zip_low_i64x4(a1, b1);
+        let hi_hi = self.zip_high_i64x4(a1, b1);
+        (
+            self.combine_i64x4(lo_lo, lo_hi),
+            self.combine_i64x4(hi_lo, hi_hi),
+        )
     }
     #[inline(always)]
-    fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
+    fn deinterleave_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> (i64x8<Self>, i64x8<Self>) {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let lo_even = self.unzip_low_i64x4(a0, a1);
+        let lo_odd = self.unzip_high_i64x4(a0, a1);
+        let hi_even = self.unzip_low_i64x4(b0, b1);
+        let hi_odd = self.unzip_high_i64x4(b0, b1);
+        (
+            self.combine_i64x4(lo_even, hi_even),
+            self.combine_i64x4(lo_odd, hi_odd),
+        )
     }
     #[inline(always)]
-    fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
+    fn select_i64x8(self, a: mask64x8<Self>, b: i64x8<Self>, c: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        let (c0, c1) = self.split_i64x8(c);
+        self.combine_i64x4(self.select_i64x4(a0, b0, c0), self.select_i64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
-        let (a0, a1) = self.split_mask32x16(a);
-        self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
+    fn min_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.min_i64x4(a0, b0), self.min_i64x4(a1, b1))
+    }
+    #[inline(always)]
+    fn max_i64x8(self, a: i64x8<Self>, b: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        let (b0, b1) = self.split_i64x8(b);
+        self.combine_i64x4(self.max_i64x4(a0, b0), self.max_i64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
+    fn split_i64x8(self, a: i64x8<Self>) -> (i64x4<Self>, i64x4<Self>) {
         (
-            mask32x8 {
+            i64x4 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            mask32x8 {
+            i64x4 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn splat_f64x8(self, val: f64) -> f64x8<Self> {
-        let half = self.splat_f64x4(val);
-        self.combine_f64x4(half, half)
+    fn neg_i64x8(self, a: i64x8<Self>) -> i64x8<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_i64x4(self.neg_i64x4(a0), self.neg_i64x4(a1))
     }
     #[inline(always)]
-    fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn reinterpret_u8_i64x8(self, a: i64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_i64x4(a0), self.reinterpret_u8_i64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_i64x8(self, a: i64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_i64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_i64x4(a0),
+            self.reinterpret_u32_i64x4(a1),
+        )
+    }
+    #[inline(always)]
+    fn splat_u64x8(self, val: u64) -> u64x8<Self> {
+        let half = self.splat_u64x4(val);
+        self.combine_u64x4(half, half)
+    }
+    #[inline(always)]
+    fn load_array_u64x8(self, val: [u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
-        f64x8 {
+    fn load_array_ref_u64x8(self, val: &[u64; 8usize]) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
-        crate::transmute::checked_transmute_copy::<[v128; 4usize], [f64; 8usize]>(&a.val.0)
+    fn as_array_u64x8(self, a: u64x8<Self>) -> [u64; 8usize] {
+        crate::transmute::checked_transmute_copy::<[v128; 4usize], [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
-        crate::transmute::checked_cast_ref::<[v128; 4usize], [f64; 8usize]>(&a.val.0)
+    fn as_array_ref_u64x8(self, a: &u64x8<Self>) -> &[u64; 8usize] {
+        crate::transmute::checked_cast_ref::<[v128; 4usize], [u64; 8usize]>(&a.val.0)
     }
     #[inline(always)]
-    fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
-        crate::transmute::checked_cast_mut::<[v128; 4usize], [f64; 8usize]>(&mut a.val.0)
+    fn as_array_mut_u64x8(self, a: &mut u64x8<Self>) -> &mut [u64; 8usize] {
+        crate::transmute::checked_cast_mut::<[v128; 4usize], [u64; 8usize]>(&mut a.val.0)
     }
     #[inline(always)]
-    fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
+    fn store_array_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
         crate::transmute::checked_transmute_store(a.val.0, dest);
     }
     #[inline(always)]
-    fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
-        f64x8 {
+    fn cvt_from_bytes_u64x8(self, a: u8x64<Self>) -> u64x8<Self> {
+        u64x8 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
+    fn cvt_to_bytes_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
         u8x64 {
             val: crate::transmute::checked_transmute_copy(&a.val),
             simd: self,
         }
     }
     #[inline(always)]
-    fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
+    fn slide_u64x8<const SHIFT: usize>(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
         if SHIFT >= 8usize {
             return b;
         }
         let result = cross_block_slide_128x4(
-            self.cvt_to_bytes_f64x8(a).val.0,
-            self.cvt_to_bytes_f64x8(b).val.0,
+            self.cvt_to_bytes_u64x8(a).val.0,
+            self.cvt_to_bytes_u64x8(b).val.0,
             SHIFT * 8usize,
         );
-        self.cvt_from_bytes_f64x8(u8x64 {
+        self.cvt_from_bytes_u64x8(u8x64 {
             val: crate::support::Aligned512(result),
             simd: self,
         })
     }
     #[inline(always)]
-    fn slide_within_blocks_f64x8<const SHIFT: usize>(
+    fn slide_within_blocks_u64x8<const SHIFT: usize>(
         self,
-        a: f64x8<Self>,
-        b: f64x8<Self>,
-    ) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
-            self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
-        )
-    }
-    #[inline(always)]
-    fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
-    }
-    #[inline(always)]
-    fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
-    }
-    #[inline(always)]
-    fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
-    }
-    #[inline(always)]
-    fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.approximate_recip_f64x4(a0),
-            self.approximate_recip_f64x4(a1),
+        a: u64x8<Self>,
+        b: u64x8<Self>,
+    ) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(
+            self.slide_within_blocks_u64x4::<SHIFT>(a0, b0),
+            self.slide_within_blocks_u64x4::<SHIFT>(a1, b1),
         )
     }
     #[inline(always)]
-    fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
-    }
-    #[inline(always)]
-    fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
+    fn add_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.add_u64x4(a0, b0), self.add_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
+    fn sub_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.sub_u64x4(a0, b0), self.sub_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
+    fn mul_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.mul_u64x4(a0, b0), self.mul_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
+    fn and_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.and_u64x4(a0, b0), self.and_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
+    fn or_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.or_u64x4(a0, b0), self.or_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
+    fn xor_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.xor_u64x4(a0, b0), self.xor_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
+    fn not_u64x8(self, a: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.not_u64x4(a0), self.not_u64x4(a1))
     }
     #[inline(always)]
-    fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
+    fn shl_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shl_u64x4(a0, shift), self.shl_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
+    fn shlv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shlv_u64x4(a0, b0), self.shlv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, _) = self.split_f64x8(a);
-        let (b0, _) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
+    fn shr_u64x8(self, a: u64x8<Self>, shift: u32) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u64x4(self.shr_u64x4(a0, shift), self.shr_u64x4(a1, shift))
     }
     #[inline(always)]
-    fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (_, a1) = self.split_f64x8(a);
-        let (_, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
+    fn shrv_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.shrv_u64x4(a0, b0), self.shrv_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
+    fn simd_eq_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_eq_u64x4(a0, b0), self.simd_eq_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
+    fn simd_lt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_lt_u64x4(a0, b0), self.simd_lt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn interleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_lo = self.zip_low_f64x4(a0, b0);
-        let lo_hi = self.zip_high_f64x4(a0, b0);
-        let hi_lo = self.zip_low_f64x4(a1, b1);
-        let hi_hi = self.zip_high_f64x4(a1, b1);
-        (
-            self.combine_f64x4(lo_lo, lo_hi),
-            self.combine_f64x4(hi_lo, hi_hi),
-        )
+    fn simd_le_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_le_u64x4(a0, b0), self.simd_le_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn deinterleave_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> (f64x8<Self>, f64x8<Self>) {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let lo_even = self.unzip_low_f64x4(a0, a1);
-        let lo_odd = self.unzip_high_f64x4(a0, a1);
-        let hi_even = self.unzip_low_f64x4(b0, b1);
-        let hi_odd = self.unzip_high_f64x4(b0, b1);
-        (
-            self.combine_f64x4(lo_even, hi_even),
-            self.combine_f64x4(lo_odd, hi_odd),
-        )
+    fn simd_ge_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_ge_u64x4(a0, b0), self.simd_ge_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
+    fn simd_gt_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> mask64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_mask64x4(self.simd_gt_u64x4(a0, b0), self.simd_gt_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
+    fn zip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, _) = self.split_u64x8(a);
+        let (b0, _) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a0, b0), self.zip_high_u64x4(a0, b0))
     }
     #[inline(always)]
-    fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.max_precise_f64x4(a0, b0),
-            self.max_precise_f64x4(a1, b1),
-        )
+    fn zip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (_, a1) = self.split_u64x8(a);
+        let (_, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.zip_low_u64x4(a1, b1), self.zip_high_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        self.combine_f64x4(
-            self.min_precise_f64x4(a0, b0),
-            self.min_precise_f64x4(a1, b1),
-        )
+    fn unzip_low_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_low_u64x4(a0, a1), self.unzip_low_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_add_f64x4(a0, b0, c0),
-            self.mul_add_f64x4(a1, b1, c1),
-        )
+    fn unzip_high_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.unzip_high_u64x4(a0, a1), self.unzip_high_u64x4(b0, b1))
     }
     #[inline(always)]
-    fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(
-            self.mul_sub_f64x4(a0, b0, c0),
-            self.mul_sub_f64x4(a1, b1, c1),
+    fn interleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_lo = self.zip_low_u64x4(a0, b0);
+        let lo_hi = self.zip_high_u64x4(a0, b0);
+        let hi_lo = self.zip_low_u64x4(a1, b1);
+        let hi_hi = self.zip_high_u64x4(a1, b1);
+        (
+            self.combine_u64x4(lo_lo, lo_hi),
+            self.combine_u64x4(hi_lo, hi_hi),
         )
     }
     #[inline(always)]
-    fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
-    }
-    #[inline(always)]
-    fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
-    }
-    #[inline(always)]
-    fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(
-            self.round_ties_even_f64x4(a0),
-            self.round_ties_even_f64x4(a1),
+    fn deinterleave_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> (u64x8<Self>, u64x8<Self>) {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let lo_even = self.unzip_low_u64x4(a0, a1);
+        let lo_odd = self.unzip_high_u64x4(a0, a1);
+        let hi_even = self.unzip_low_u64x4(b0, b1);
+        let hi_odd = self.unzip_high_u64x4(b0, b1);
+        (
+            self.combine_u64x4(lo_even, hi_even),
+            self.combine_u64x4(lo_odd, hi_odd),
         )
     }
     #[inline(always)]
-    fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
+    fn select_u64x8(self, a: mask64x8<Self>, b: u64x8<Self>, c: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_mask64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        let (c0, c1) = self.split_u64x8(c);
+        self.combine_u64x4(self.select_u64x4(a0, b0, c0), self.select_u64x4(a1, b1, c1))
     }
     #[inline(always)]
-    fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
+    fn min_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.min_u64x4(a0, b0), self.min_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_mask64x8(a);
-        let (b0, b1) = self.split_f64x8(b);
-        let (c0, c1) = self.split_f64x8(c);
-        self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
+    fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        let (b0, b1) = self.split_u64x8(b);
+        self.combine_u64x4(self.max_u64x4(a0, b0), self.max_u64x4(a1, b1))
     }
     #[inline(always)]
-    fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
+    fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>) {
         (
-            f64x4 {
+            u64x4 {
                 val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
                 simd: self,
             },
-            f64x4 {
+            u64x4 {
                 val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
                 simd: self,
             },
         )
     }
     #[inline(always)]
-    fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
-        let (a0, a1) = self.split_f64x8(a);
-        self.combine_f32x8(
-            self.reinterpret_f32_f64x4(a0),
-            self.reinterpret_f32_f64x4(a1),
+    fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> {
+        let (chunks, []) = src.as_chunks::<2usize>() else {
+            unreachable!()
+        };
+        let v0: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[0]);
+        let v1: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[1]);
+        let v2: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[2]);
+        let v3: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[3]);
+        let v01_lower = u64x2_shuffle::<0, 2>(v0, v1);
+        let v23_lower = u64x2_shuffle::<0, 2>(v2, v3);
+        let v01_upper = u64x2_shuffle::<1, 3>(v0, v1);
+        let v23_upper = u64x2_shuffle::<1, 3>(v2, v3);
+        let out0 = u64x2_shuffle::<0, 1>(v01_lower, v23_lower);
+        let out1 = u64x2_shuffle::<2, 3>(v01_lower, v23_lower);
+        let out2 = u64x2_shuffle::<0, 1>(v01_upper, v23_upper);
+        let out3 = u64x2_shuffle::<2, 3>(v01_upper, v23_upper);
+        let combined_lower = self.combine_u64x2(out0.simd_into(self), out1.simd_into(self));
+        let combined_upper = self.combine_u64x2(out2.simd_into(self), out3.simd_into(self));
+        self.combine_u64x4(combined_lower, combined_upper)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
+        let (lower, upper) = self.split_u64x8(a);
+        let (v0_vec, v1_vec) = self.split_u64x4(lower);
+        let (v2_vec, v3_vec) = self.split_u64x4(upper);
+        let v0: v128 = v0_vec.into();
+        let v1: v128 = v1_vec.into();
+        let v2: v128 = v2_vec.into();
+        let v3: v128 = v3_vec.into();
+        let out0 = u64x2_shuffle::<0, 2>(v0, v2);
+        let out1 = u64x2_shuffle::<1, 3>(v0, v2);
+        let out2 = u64x2_shuffle::<0, 2>(v1, v3);
+        let out3 = u64x2_shuffle::<1, 3>(v1, v3);
+        let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u64; 2usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u64; 2usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u64; 2usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u64; 2usize]>(out3, &mut chunks[3]);
+    }
+    #[inline(always)]
+    fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u8x32(self.reinterpret_u8_u64x4(a0), self.reinterpret_u8_u64x4(a1))
+    }
+    #[inline(always)]
+    fn reinterpret_u32_u64x8(self, a: u64x8<Self>) -> u32x16<Self> {
+        let (a0, a1) = self.split_u64x8(a);
+        self.combine_u32x8(
+            self.reinterpret_u32_u64x4(a0),
+            self.reinterpret_u32_u64x4(a1),
         )
     }
     #[inline(always)]
@@ -8225,6 +10025,36 @@ impl<S: Simd> From<f64x2<S>> for v128 {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
+impl<S: Simd> SimdFrom<v128, S> for i64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: v128) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<i64x2<S>> for v128 {
+    #[inline(always)]
+    fn from(value: i64x2<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
+impl<S: Simd> SimdFrom<v128, S> for u64x2<S> {
+    #[inline(always)]
+    fn simd_from(simd: S, arch: v128) -> Self {
+        Self {
+            val: crate::transmute::checked_transmute_copy(&arch),
+            simd,
+        }
+    }
+}
+impl<S: Simd> From<u64x2<S>> for v128 {
+    #[inline(always)]
+    fn from(value: u64x2<S>) -> Self {
+        crate::transmute::checked_transmute_copy(&value.val)
+    }
+}
 impl<S: Simd> SimdFrom<v128, S> for mask64x2<S> {
     #[inline(always)]
     fn simd_from(simd: S, arch: v128) -> Self {
diff --git a/fearless_simd/src/traits.rs b/fearless_simd/src/traits.rs
index e51b0a5fb..77b63f5f2 100644
--- a/fearless_simd/src/traits.rs
+++ b/fearless_simd/src/traits.rs
@@ -66,6 +66,7 @@ impl Seal for u16 {}
 impl Seal for i16 {}
 impl Seal for u32 {}
 impl Seal for i32 {}
+impl Seal for u64 {}
 impl Seal for i64 {}
 
 /// Value conversion, adding a SIMD blessing.
@@ -141,6 +142,10 @@ impl SimdElement for i32 {
     type Mask = Self;
 }
 
+impl SimdElement for u64 {
+    type Mask = i64;
+}
+
 impl SimdElement for i64 {
     type Mask = Self;
 }
diff --git a/fearless_simd/src/transmute.rs b/fearless_simd/src/transmute.rs
index 2c6b30b22..7baad51a6 100644
--- a/fearless_simd/src/transmute.rs
+++ b/fearless_simd/src/transmute.rs
@@ -26,7 +26,7 @@ use core::arch::aarch64::{
     int8x16_t, int8x16x2_t, int8x16x4_t, int16x8_t, int16x8x2_t, int16x8x4_t, int32x4_t,
     int32x4x2_t, int32x4x4_t, int64x2_t, int64x2x2_t, int64x2x4_t, uint8x16_t, uint8x16x2_t,
     uint8x16x4_t, uint16x8_t, uint16x8x2_t, uint16x8x4_t, uint32x4_t, uint32x4x2_t, uint32x4x4_t,
-    uint64x2_t,
+    uint64x2_t, uint64x2x2_t, uint64x2x4_t,
 };
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
 use core::arch::wasm32::v128;
@@ -102,6 +102,7 @@ impl_aligned_simd_pod!(
     Aligned128<[u8; 16]>,
     Aligned128<[u16; 8]>,
     Aligned128<[u32; 4]>,
+    Aligned128<[u64; 2]>,
     Aligned256<[f32; 8]>,
     Aligned256<[f64; 4]>,
     Aligned256<[i8; 32]>,
@@ -111,6 +112,7 @@ impl_aligned_simd_pod!(
     Aligned256<[u8; 32]>,
     Aligned256<[u16; 16]>,
     Aligned256<[u32; 8]>,
+    Aligned256<[u64; 4]>,
     Aligned512<[f32; 16]>,
     Aligned512<[f64; 8]>,
     Aligned512<[i8; 64]>,
@@ -120,6 +122,7 @@ impl_aligned_simd_pod!(
     Aligned512<[u8; 64]>,
     Aligned512<[u16; 32]>,
     Aligned512<[u32; 16]>,
+    Aligned512<[u64; 8]>,
 );
 
 // the `const` is just to only use a single cfg annotation, nothing to do with const evaluation
@@ -179,6 +182,8 @@ const _: () = {
     unsafe impl SimdPod for uint32x4x2_t {}
     unsafe impl SimdPod for uint32x4x4_t {}
     unsafe impl SimdPod for uint64x2_t {}
+    unsafe impl SimdPod for uint64x2x2_t {}
+    unsafe impl SimdPod for uint64x2x4_t {}
 };
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -221,6 +226,7 @@ impl_aligned_simd_pod!(
     Aligned128<uint8x16_t>,
     Aligned128<uint16x8_t>,
     Aligned128<uint32x4_t>,
+    Aligned128<uint64x2_t>,
     Aligned256<float32x4x2_t>,
     Aligned256<float64x2x2_t>,
     Aligned256<int8x16x2_t>,
@@ -230,6 +236,7 @@ impl_aligned_simd_pod!(
     Aligned256<uint8x16x2_t>,
     Aligned256<uint16x8x2_t>,
     Aligned256<uint32x4x2_t>,
+    Aligned256<uint64x2x2_t>,
     Aligned512<float32x4x4_t>,
     Aligned512<float64x2x4_t>,
     Aligned512<int8x16x4_t>,
@@ -239,6 +246,7 @@ impl_aligned_simd_pod!(
     Aligned512<uint8x16x4_t>,
     Aligned512<uint16x8x4_t>,
     Aligned512<uint32x4x4_t>,
+    Aligned512<uint64x2x4_t>,
 );
 
 /// Like [`core::mem::transmute_copy`], but statically rejects differently-sized
diff --git a/fearless_simd_gen/src/arch/neon.rs b/fearless_simd_gen/src/arch/neon.rs
index f32a4f3e2..b0a11cf96 100644
--- a/fearless_simd_gen/src/arch/neon.rs
+++ b/fearless_simd_gen/src/arch/neon.rs
@@ -44,8 +44,19 @@ fn translate_op(op: &str) -> Option<&'static str> {
 // expects args and return value in arch dialect
 pub(crate) fn expr(op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream {
     // There is no logical NOT for 64-bit, so we need this workaround.
-    if op == "not" && ty.scalar_bits == 64 && ty.scalar == ScalarType::Mask {
-        return quote! { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))) };
+    if op == "not" && ty.scalar_bits == 64 {
+        let a = &args[0];
+        return match ty.scalar {
+            ScalarType::Int | ScalarType::Mask => {
+                quote! { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(#a))) }
+            }
+            ScalarType::Unsigned => {
+                quote! { vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(#a))) }
+            }
+            ScalarType::Float => {
+                unreachable!("64-bit floating point vectors do not support logical NOT")
+            }
+        };
     }
 
     if let Some(xlat) = translate_op(op) {
diff --git a/fearless_simd_gen/src/arch/x86.rs b/fearless_simd_gen/src/arch/x86.rs
index 43dd8b5a5..b09715657 100644
--- a/fearless_simd_gen/src/arch/x86.rs
+++ b/fearless_simd_gen/src/arch/x86.rs
@@ -171,9 +171,10 @@ pub(crate) fn coarse_type(vec_ty: &VecType) -> &'static str {
 
 pub(crate) fn set1_intrinsic(vec_ty: &VecType) -> Ident {
     use ScalarType::*;
-    let suffix = match (vec_ty.scalar, vec_ty.scalar_bits) {
-        (Int | Unsigned | Mask, 64) => "epi64x",
-        (scalar, bits) => op_suffix(scalar, bits, false),
+    let suffix = match (vec_ty.scalar, vec_ty.scalar_bits, vec_ty.n_bits()) {
+        (Int | Unsigned | Mask, 64, 512) => "epi64",
+        (Int | Unsigned | Mask, 64, _) => "epi64x",
+        (scalar, bits, _) => op_suffix(scalar, bits, false),
     };
 
     intrinsic_ident("set1", suffix, vec_ty.n_bits())
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index 09cc9c1bc..f75061b8b 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -312,8 +312,81 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
     }
 }
 
-pub(crate) fn scalar_binary(f: TokenStream) -> TokenStream {
-    quote! { core::array::from_fn(|i| #f(a[i], b[i])).simd_into(self) }
+pub(crate) fn unrolled_array(
+    len: usize,
+    mut item: impl FnMut(usize) -> TokenStream,
+) -> TokenStream {
+    let items = (0..len).map(|idx| item(idx)).collect::<Vec<_>>();
+    quote! { [#(#items),*] }
+}
+
+pub(crate) fn scalar_binary(f: TokenStream, vec_ty: &VecType, simd: impl ToTokens) -> TokenStream {
+    let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits);
+    let len = vec_ty.len;
+    let items = unrolled_array(len, |idx| quote! { #f(a[#idx], b[#idx]) });
+
+    quote! {
+        let a: [#scalar; #len] = a.into();
+        let b: [#scalar; #len] = b.into();
+        let result: [#scalar; #len] = #items;
+        result.simd_into(#simd)
+    }
+}
+
+pub(crate) fn scalar_binary_method(
+    method: &str,
+    vec_ty: &VecType,
+    simd: impl ToTokens,
+) -> TokenStream {
+    let method = Ident::new(method, Span::call_site());
+    let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits);
+    let len = vec_ty.len;
+    let items = unrolled_array(len, |idx| quote! { a[#idx].#method(b[#idx]) });
+
+    quote! {
+        let a: [#scalar; #len] = a.into();
+        let b: [#scalar; #len] = b.into();
+        let result: [#scalar; #len] = #items;
+        result.simd_into(#simd)
+    }
+}
+
+pub(crate) fn scalar_shift(f: TokenStream, vec_ty: &VecType, simd: impl ToTokens) -> TokenStream {
+    let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits);
+    let len = vec_ty.len;
+    let items = unrolled_array(len, |idx| quote! { #f(a[#idx], shift) });
+
+    quote! {
+        let a: [#scalar; #len] = a.into();
+        let result: [#scalar; #len] = #items;
+        result.simd_into(#simd)
+    }
+}
+
+pub(crate) fn scalar_compare(method: &str, vec_ty: &VecType, simd: impl ToTokens) -> TokenStream {
+    let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits);
+    let mask_scalar = ScalarType::Mask.rust(vec_ty.scalar_bits);
+    let len = vec_ty.len;
+    let op = match method {
+        "simd_eq" => quote! { == },
+        "simd_lt" => quote! { < },
+        "simd_le" => quote! { <= },
+        "simd_ge" => quote! { >= },
+        "simd_gt" => quote! { > },
+        _ => unreachable!("unsupported scalar comparison: {method}"),
+    };
+    let items = unrolled_array(len, |idx| {
+        quote! { if a[#idx] #op b[#idx] { true_lane } else { false_lane } }
+    });
+
+    quote! {
+        let a: [#scalar; #len] = a.into();
+        let b: [#scalar; #len] = b.into();
+        let true_lane: #mask_scalar = !0;
+        let false_lane: #mask_scalar = 0;
+        let result: [#mask_scalar; #len] = #items;
+        result.simd_into(#simd)
+    }
 }
 
 pub(crate) fn generic_block_split(
@@ -468,11 +541,13 @@ pub(crate) fn generic_from_bytes(method_sig: TokenStream, vec_ty: &VecType) -> T
 pub(crate) fn generic_mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
     let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits);
     let len = vec_ty.len;
+    let lanes = unrolled_array(len, |idx| {
+        quote! { if ((bits >> #idx) & 1) != 0 { !0 } else { 0 } }
+    });
 
     quote! {
         #method_sig {
-            let lanes: [#scalar; #len] =
-                core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+            let lanes: [#scalar; #len] = #lanes;
             lanes.simd_into(self)
         }
     }
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index 63b37dc7c..3680a2cdd 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -176,6 +176,8 @@ pub(crate) trait Level {
             (ScalarType::Int, 16),
             (ScalarType::Unsigned, 32),
             (ScalarType::Int, 32),
+            (ScalarType::Unsigned, 64),
+            (ScalarType::Int, 64),
             (ScalarType::Mask, 8),
             (ScalarType::Mask, 16),
             (ScalarType::Mask, 32),
diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs
index 92099258a..810baa9e1 100644
--- a/fearless_simd_gen/src/mk_fallback.rs
+++ b/fearless_simd_gen/src/mk_fallback.rs
@@ -472,8 +472,12 @@ impl Level for Fallback {
                 block_count,
             } => {
                 let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
-                let items =
-                    interleave_indices(len, block_count as usize, |idx| quote! { src[#idx] });
+                let stride = if vec_ty.scalar_bits == 64 {
+                    len / block_count as usize
+                } else {
+                    block_count as usize
+                };
+                let items = interleave_indices(len, stride, |idx| quote! { src[#idx] });
 
                 quote! {
                     #method_sig {
@@ -486,8 +490,12 @@ impl Level for Fallback {
                 block_count,
             } => {
                 let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
-                let items =
-                    interleave_indices(len, len / block_count as usize, |idx| quote! { a[#idx] });
+                let stride = if vec_ty.scalar_bits == 64 {
+                    block_count as usize
+                } else {
+                    len / block_count as usize
+                };
+                let items = interleave_indices(len, stride, |idx| quote! { a[#idx] });
 
                 quote! {
                     #method_sig {
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 2cdf737f4..a70f44d08 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -6,7 +6,7 @@ use quote::{ToTokens as _, format_ident, quote};
 
 use crate::generic::{
     generic_as_array, generic_from_array, generic_from_bytes, generic_mask_set, generic_op_name,
-    generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary_method,
 };
 use crate::level::Level;
 use crate::ops::{Op, SlideGranularity, valid_reinterpret};
@@ -204,6 +204,18 @@ impl Level for Neon {
                 }
             }
             OpSig::Binary => self.kernel_method(op, vec_ty, |token| match method {
+                "mul"
+                    if vec_ty.scalar_bits == 64
+                        && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) =>
+                {
+                    scalar_binary_method("wrapping_mul", vec_ty, token)
+                }
+                "min" | "max"
+                    if vec_ty.scalar_bits == 64
+                        && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) =>
+                {
+                    scalar_binary_method(method, vec_ty, token)
+                }
                 "shlv" | "shrv" => {
                     let mut args = if vec_ty.scalar == ScalarType::Int {
                         // Signed case
diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
index 99fb91a76..cbfe9e8ab 100644
--- a/fearless_simd_gen/src/mk_simd_trait.rs
+++ b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -68,7 +68,7 @@ pub(crate) fn mk_simd_trait() -> TokenStream {
             /// A native-width SIMD vector of [`f32`]s.
             type f32s: SimdFloat<Self, Element = f32, Block = f32x4<Self>, Mask = Self::mask32s, Bytes = <Self::u32s as Bytes>::Bytes> + SimdCvtFloat<Self::u32s> + SimdCvtFloat<Self::i32s>;
             /// A native-width SIMD vector of [`f64`]s.
-            type f64s: SimdFloat<Self, Element = f64, Block = f64x2<Self>, Mask = Self::mask64s>;
+            type f64s: SimdFloat<Self, Element = f64, Block = f64x2<Self>, Mask = Self::mask64s, Bytes = <Self::u64s as Bytes>::Bytes>;
             /// A native-width SIMD vector of [`u8`]s.
             type u8s: SimdInt<Self, Element = u8, Block = u8x16<Self>, Mask = Self::mask8s>;
             /// A native-width SIMD vector of [`i8`]s.
@@ -82,6 +82,11 @@ pub(crate) fn mk_simd_trait() -> TokenStream {
             /// A native-width SIMD vector of [`i32`]s.
             type i32s: SimdInt<Self, Element = i32, Block = i32x4<Self>, Mask = Self::mask32s, Bytes = <Self::u32s as Bytes>::Bytes> + SimdCvtTruncate<Self::f32s>
                 + core::ops::Neg<Output = Self::i32s>;
+            /// A native-width SIMD vector of [`u64`]s.
+            type u64s: SimdInt<Self, Element = u64, Block = u64x2<Self>, Mask = Self::mask64s>;
+            /// A native-width SIMD vector of [`i64`]s.
+            type i64s: SimdInt<Self, Element = i64, Block = i64x2<Self>, Mask = Self::mask64s, Bytes = <Self::u64s as Bytes>::Bytes>
+                + core::ops::Neg<Output = Self::i64s>;
             /// A native-width SIMD mask with 8-bit lanes.
             type mask8s: SimdMask<Self, Element = i8> + Select<Self::u8s> + Select<Self::i8s> + Select<Self::mask8s>;
             /// A native-width SIMD mask with 16-bit lanes.
@@ -89,7 +94,7 @@ pub(crate) fn mk_simd_trait() -> TokenStream {
             /// A native-width SIMD mask with 32-bit lanes.
             type mask32s: SimdMask<Self, Element = i32> + Select<Self::f32s> + Select<Self::u32s> + Select<Self::i32s> + Select<Self::mask32s>;
             /// A native-width SIMD mask with 64-bit lanes.
-            type mask64s: SimdMask<Self, Element = i64> + Select<Self::f64s> + Select<Self::mask64s>;
+            type mask64s: SimdMask<Self, Element = i64> + Select<Self::f64s> + Select<Self::u64s> + Select<Self::i64s> + Select<Self::mask64s>;
 
             /// This SIMD token's feature level.
             fn level(self) -> Level;
diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs
index b6f2aafce..7c66796d4 100644
--- a/fearless_simd_gen/src/mk_simd_types.rs
+++ b/fearless_simd_gen/src/mk_simd_types.rs
@@ -5,7 +5,7 @@ use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{format_ident, quote};
 
 use crate::{
-    generic::generic_op_name,
+    generic::{generic_op_name, unrolled_array},
     ops::{
         F32_TO_I32, F32_TO_I32_PRECISE, F32_TO_U32, F32_TO_U32_PRECISE, I32_TO_F32, Op, OpSig,
         TyFlavor, U32_TO_F32, vec_trait_ops_for,
@@ -374,6 +374,7 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream {
     let name = ty.rust();
     let scalar = ty.scalar.rust(ty.scalar_bits);
     let len = Literal::usize_unsuffixed(ty.len);
+    let from_fn_items = unrolled_array(ty.len, |idx| quote! { f(#idx) });
     let vec_trait = match ty.scalar {
         ScalarType::Float => "SimdFloat",
         ScalarType::Unsigned | ScalarType::Int => "SimdInt",
@@ -473,8 +474,8 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream {
             }
 
             #[inline(always)]
-            fn from_fn(simd: S, f: impl FnMut(usize) -> #scalar) -> Self {
-                simd.#from_array_op(core::array::from_fn(f))
+            fn from_fn(simd: S, mut f: impl FnMut(usize) -> #scalar) -> Self {
+                simd.#from_array_op(#from_fn_items)
             }
 
             #[inline(always)]
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index f62014bdd..11efb293b 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -8,7 +8,7 @@ use crate::arch::wasm::{arch_prefix, v128_intrinsic};
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
     generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
-    integer_lane_mask_splat_arg, scalar_binary,
+    integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare,
 };
 use crate::level::Level;
 use crate::ops::{Op, Quantifier, SlideGranularity, valid_reinterpret};
@@ -246,8 +246,14 @@ impl Level for WasmSimd128 {
                             { #expr.simd_into(self) }
                         }
                     }
-                    "shlv" => scalar_binary(quote!(core::ops::Shl::shl)),
-                    "shrv" => scalar_binary(quote!(core::ops::Shr::shr)),
+                    "min" | "max"
+                        if vec_ty.scalar_bits == 64
+                            && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned) =>
+                    {
+                        scalar_binary_method(method, vec_ty, quote! { self })
+                    }
+                    "shlv" => scalar_binary(quote!(core::ops::Shl::shl), vec_ty, quote! { self }),
+                    "shrv" => scalar_binary(quote!(core::ops::Shr::shr), vec_ty, quote! { self }),
                     "copysign" => {
                         let splat = simple_intrinsic("splat", vec_ty);
                         let sign_mask_literal = match vec_ty.scalar_bits {
@@ -306,11 +312,16 @@ impl Level for WasmSimd128 {
                 }
             }
             OpSig::Compare => {
-                let args = [quote! { a.into() }, quote! { b.into() }];
-                let expr = wasm::expr(method, vec_ty, &args);
+                let expr = if vec_ty.scalar == ScalarType::Unsigned && vec_ty.scalar_bits == 64 {
+                    scalar_compare(method, vec_ty, quote! { self })
+                } else {
+                    let args = [quote! { a.into() }, quote! { b.into() }];
+                    let expr = wasm::expr(method, vec_ty, &args);
+                    quote! { #expr.simd_into(self) }
+                };
                 quote! {
                     #method_sig {
-                        #expr.simd_into(self)
+                        #expr
                     }
                 }
             }
@@ -626,6 +637,13 @@ impl Level for WasmSimd128 {
                         quote! { 2, 3, 6, 7 },
                         quote! { u32x4_shuffle },
                     ),
+                    64 => (
+                        quote! { 0, 2 },
+                        quote! { 1, 3 },
+                        quote! { 0, 1 },
+                        quote! { 2, 3 },
+                        quote! { u64x2_shuffle },
+                    ),
                     _ => panic!("unsupported scalar_bits"),
                 };
 
@@ -686,6 +704,44 @@ impl Level for WasmSimd128 {
                 let elems_per_vec = block_size as usize / vec_ty.scalar_bits;
                 let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
 
+                if vec_ty.scalar_bits == 64 {
+                    let block_ty = vec_ty.block_ty();
+                    let block_ty_2x =
+                        VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2);
+                    let block_ty_4x =
+                        VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 4);
+
+                    let split_method = generic_op_name("split", &block_ty_2x);
+                    let split_method_2x = generic_op_name("split", &block_ty_4x);
+
+                    return quote! {
+                        #method_sig {
+                            let (lower, upper) = self.#split_method_2x(a);
+                            let (v0_vec, v1_vec) = self.#split_method(lower);
+                            let (v2_vec, v3_vec) = self.#split_method(upper);
+
+                            let v0: v128 = v0_vec.into();
+                            let v1: v128 = v1_vec.into();
+                            let v2: v128 = v2_vec.into();
+                            let v3: v128 = v3_vec.into();
+
+                            let out0 = u64x2_shuffle::<0, 2>(v0, v2);
+                            let out1 = u64x2_shuffle::<1, 3>(v0, v2);
+                            let out2 = u64x2_shuffle::<0, 2>(v1, v3);
+                            let out3 = u64x2_shuffle::<1, 3>(v1, v3);
+
+                            let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else {
+                                unreachable!()
+                            };
+
+                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out0, &mut chunks[0]);
+                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out1, &mut chunks[1]);
+                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out2, &mut chunks[2]);
+                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out3, &mut chunks[3]);
+                        }
+                    };
+                }
+
                 let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits {
                     8 => (
                         quote! { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 },
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 08ee3ac51..84f7a9a1d 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -9,7 +9,7 @@ use crate::arch::x86::{
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
     generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
-    integer_lane_mask_splat_arg, scalar_binary,
+    integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, scalar_shift,
 };
 use crate::level::Level;
 use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret};
@@ -911,6 +911,20 @@ fn interleaved_store_indices(len: usize, block_count: usize) -> Vec<usize> {
         .collect()
 }
 
+fn interleaved_load_indices_64(len: usize, block_count: usize) -> Vec<usize> {
+    let stream_len = len / block_count; // entries per stream (integer division)
+    (0..stream_len)
+        .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i))
+        .collect() // e.g. len = 8, block_count = 4 => [0, 2, 4, 6, 1, 3, 5, 7]
+}
+
+fn interleaved_store_indices_64(len: usize, block_count: usize) -> Vec<usize> {
+    let stream_len = len / block_count; // entries per stream (integer division)
+    (0..block_count)
+        .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream))
+        .collect() // e.g. len = 8, block_count = 4 => [0, 4, 1, 5, 2, 6, 3, 7]
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, op: Op, vec_ty: &VecType) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
@@ -1198,6 +1212,13 @@ impl X86 {
             });
         }
 
+        if vec_ty.scalar_bits == 64
+            && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)
+            && method != "simd_eq"
+        {
+            return self.kernel_method(op, vec_ty, |token| scalar_compare(method, vec_ty, token));
+        }
+
         let args = [quote! { a.into() }, quote! { b.into() }];
 
         let expr = if vec_ty.scalar != ScalarType::Float {
@@ -1566,7 +1587,36 @@ impl X86 {
             });
         }
 
+        if *self != Self::Avx512
+            && vec_ty.scalar_bits == 64
+            && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)
+            && matches!(method, "mul" | "min" | "max")
+        {
+            let body = if method == "mul" {
+                scalar_binary_method("wrapping_mul", vec_ty, quote! { self })
+            } else {
+                scalar_binary_method(method, vec_ty, quote! { self })
+            };
+            return quote! {
+                #method_sig {
+                    #body
+                }
+            };
+        }
+
         match method {
+            "shrv"
+                if *self != Self::Avx512
+                    && vec_ty.scalar == ScalarType::Int
+                    && vec_ty.scalar_bits == 64 =>
+            {
+                let body = scalar_binary(quote!(core::ops::Shr::shr), vec_ty, quote! { self });
+                quote! {
+                    #method_sig {
+                        #body
+                    }
+                }
+            }
             "shlv" | "shrv"
                 if *self == Self::Avx512
                     && matches!(vec_ty.scalar, ScalarType::Int | ScalarType::Unsigned)
@@ -1581,8 +1631,8 @@ impl X86 {
             {
                 // SSE2 has shift operations, but they shift every lane by the same amount, so we can't use them here.
                 let body = match method {
-                    "shlv" => scalar_binary(quote!(core::ops::Shl::shl)),
-                    "shrv" => scalar_binary(quote!(core::ops::Shr::shr)),
+                    "shlv" => scalar_binary(quote!(core::ops::Shl::shl), vec_ty, quote! { self }),
+                    "shrv" => scalar_binary(quote!(core::ops::Shr::shr), vec_ty, quote! { self }),
                     _ => unreachable!(),
                 };
                 quote! {
@@ -1695,6 +1745,20 @@ impl X86 {
     }
 
     pub(crate) fn handle_shift(&self, op: Op, method: &str, vec_ty: &VecType) -> TokenStream {
+        let method_sig = op.simd_trait_method_sig(vec_ty);
+        if *self != Self::Avx512
+            && method == "shr"
+            && vec_ty.scalar == ScalarType::Int
+            && vec_ty.scalar_bits == 64
+        {
+            let body = scalar_shift(quote!(core::ops::Shr::shr), vec_ty, quote! { self });
+            return quote! {
+                #method_sig {
+                    #body
+                }
+            };
+        }
+
         let shift_op = match (method, vec_ty.scalar) {
             ("shr", ScalarType::Unsigned) => "srl",
             ("shr", ScalarType::Int) => "sra",
@@ -2275,6 +2339,14 @@ impl X86 {
 
                     quote! { #intrinsic::<#mask>(a.into(), b.into()).simd_into(#token) }
                 }
+                (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 64) => {
+                    let op = if select_even { "unpacklo" } else { "unpackhi" };
+                    let intrinsic = intrinsic_ident(op, "epi64", vec_ty.n_bits());
+
+                    quote! {
+                        #intrinsic(a.into(), b.into()).simd_into(#token)
+                    }
+                }
                 (ScalarType::Int | ScalarType::Mask | ScalarType::Unsigned, 128, 32) => {
                     // 128-bit shuffle of 32-bit integers; unlike with floats, there is no single shuffle instruction that
                     // combines two vectors
@@ -2867,6 +2939,55 @@ impl X86 {
             return self.handle_avx512_load_interleaved(op, vec_ty, block_size, block_count);
         }
         match vec_ty.scalar_bits {
+            64 => {
+                let block_ty =
+                    VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
+                let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
+                let native_ty = self.arch_ty(&block_ty);
+                let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty);
+                let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty);
+                let vec_combined =
+                    VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2);
+                let combine_half = Ident::new(
+                    &format!("combine_{}", block_ty.rust_name()),
+                    Span::call_site(),
+                );
+                let combine_full = Ident::new(
+                    &format!("combine_{}", vec_combined.rust_name()),
+                    Span::call_site(),
+                );
+                let block_len = block_size as usize / vec_ty.scalar_bits;
+
+                self.kernel_method(op, vec_ty, |token| {
+                    quote! {
+                        let (chunks, []) = src.as_chunks::<#block_len>() else {
+                            unreachable!()
+                        };
+                        let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[0],
+                        );
+                        let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[1],
+                        );
+                        let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[2],
+                        );
+                        let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[3],
+                        );
+
+                        let out0 = #unpacklo_64(v0, v1);
+                        let out1 = #unpacklo_64(v2, v3);
+                        let out2 = #unpackhi_64(v0, v1);
+                        let out3 = #unpackhi_64(v2, v3);
+
+                        #token.#combine_full(
+                            #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)),
+                            #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)),
+                        )
+                    }
+                })
+            }
             32 | 16 | 8 => {
                 let block_ty =
                     VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
@@ -3010,10 +3131,12 @@ impl X86 {
         let native_ty = self.arch_ty(vec_ty);
         let len = vec_ty.len;
         let permute = avx512_permutexvar_intrinsic(vec_ty);
-        let indices = avx512_index_vector(
-            vec_ty,
-            interleaved_load_indices(vec_ty.len, block_count as usize),
-        );
+        let indices = if vec_ty.scalar_bits == 64 {
+            interleaved_load_indices_64(vec_ty.len, block_count as usize)
+        } else {
+            interleaved_load_indices(vec_ty.len, block_count as usize)
+        };
+        let indices = avx512_index_vector(vec_ty, indices);
 
         self.kernel_method(op, vec_ty, |token| {
             quote! {
@@ -3042,6 +3165,50 @@ impl X86 {
             return self.handle_avx512_store_interleaved(op, vec_ty, block_size, block_count);
         }
         match vec_ty.scalar_bits {
+            64 => {
+                let block_ty =
+                    VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
+                let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
+                let native_ty = self.arch_ty(&block_ty);
+                let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty);
+                let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty);
+
+                let vec_combined =
+                    VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2);
+                let split_half = Ident::new(
+                    &format!("split_{}", vec_combined.rust_name()),
+                    Span::call_site(),
+                );
+                let split_full =
+                    Ident::new(&format!("split_{}", vec_ty.rust_name()), Span::call_site());
+                let block_len = block_size as usize / vec_ty.scalar_bits;
+
+                self.kernel_method(op, vec_ty, |token| {
+                    quote! {
+                        let (v01, v23) = #token.#split_full(a);
+                        let (v0, v1) = #token.#split_half(v01);
+                        let (v2, v3) = #token.#split_half(v23);
+                        let v0 = v0.into();
+                        let v1 = v1.into();
+                        let v2 = v2.into();
+                        let v3 = v3.into();
+
+                        let out0 = #unpacklo_64(v0, v2);
+                        let out1 = #unpackhi_64(v0, v2);
+                        let out2 = #unpacklo_64(v1, v3);
+                        let out3 = #unpackhi_64(v1, v3);
+
+                        let (chunks, []) = dest.as_chunks_mut::<#block_len>() else {
+                            unreachable!()
+                        };
+
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]);
+                    }
+                })
+            }
             32 | 16 | 8 => {
                 let block_ty =
                     VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
@@ -3179,10 +3346,12 @@ impl X86 {
         let native_ty = self.arch_ty(vec_ty);
         let len = vec_ty.len;
         let permute = avx512_permutexvar_intrinsic(vec_ty);
-        let indices = avx512_index_vector(
-            vec_ty,
-            interleaved_store_indices(vec_ty.len, block_count as usize),
-        );
+        let indices = if vec_ty.scalar_bits == 64 {
+            interleaved_store_indices_64(vec_ty.len, block_count as usize)
+        } else {
+            interleaved_store_indices(vec_ty.len, block_count as usize)
+        };
+        let indices = avx512_index_vector(vec_ty, indices);
 
         self.kernel_method(op, vec_ty, |_| {
             quote! {
diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs
index 670d6a5cb..8296652c7 100644
--- a/fearless_simd_gen/src/types.rs
+++ b/fearless_simd_gen/src/types.rs
@@ -281,6 +281,8 @@ pub(crate) const SIMD_TYPES: &[VecType] = &[
     VecType::new(ScalarType::Unsigned, 32, 4),
     VecType::new(ScalarType::Mask, 32, 4),
     VecType::new(ScalarType::Float, 64, 2),
+    VecType::new(ScalarType::Int, 64, 2),
+    VecType::new(ScalarType::Unsigned, 64, 2),
     VecType::new(ScalarType::Mask, 64, 2),
     // 256 bit types
     VecType::new(ScalarType::Float, 32, 8),
@@ -294,6 +296,8 @@ pub(crate) const SIMD_TYPES: &[VecType] = &[
     VecType::new(ScalarType::Unsigned, 32, 8),
     VecType::new(ScalarType::Mask, 32, 8),
     VecType::new(ScalarType::Float, 64, 4),
+    VecType::new(ScalarType::Int, 64, 4),
+    VecType::new(ScalarType::Unsigned, 64, 4),
     VecType::new(ScalarType::Mask, 64, 4),
     // 512 bit types
     VecType::new(ScalarType::Float, 32, 16),
@@ -307,6 +311,8 @@ pub(crate) const SIMD_TYPES: &[VecType] = &[
     VecType::new(ScalarType::Unsigned, 32, 16),
     VecType::new(ScalarType::Mask, 32, 16),
     VecType::new(ScalarType::Float, 64, 8),
+    VecType::new(ScalarType::Int, 64, 8),
+    VecType::new(ScalarType::Unsigned, 64, 8),
     VecType::new(ScalarType::Mask, 64, 8),
 ];
 
diff --git a/fearless_simd_tests/tests/harness/int64.rs b/fearless_simd_tests/tests/harness/int64.rs
new file mode 100644
index 000000000..59457eaf5
--- /dev/null
+++ b/fearless_simd_tests/tests/harness/int64.rs
@@ -0,0 +1,1464 @@
+// Copyright 2026 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use fearless_simd::*;
+use fearless_simd_dev_macros::simd_test;
+
+fn mask_lane(value: bool) -> i64 {
+    if value { -1 } else { 0 } // true => -1 (all bits set), false => 0
+}
+
+#[simd_test]
+fn construct_i64x2<S: Simd>(simd: S) {
+    let a = i64x2::from_slice(simd, &[-9, 18]);
+    let mut stored = [0_i64; 2];
+    a.store_slice(&mut stored); // slice round-trip: from_slice -> store_slice
+    assert_eq!(stored, [-9, 18]);
+    assert_eq!(*i64x2::splat(simd, -9), [-9, -9]); // one scalar repeated in each lane
+    assert_eq!(*i64x2::simd_from(simd, [-9, 18]), [-9, 18]);
+    assert_eq!(*i64x2::from_fn(simd, |i| [-9, 18][i]), [-9, 18]);
+    assert_eq!(*i64x2::from_bytes(a.to_bytes()), [-9, 18]); // byte round-trip
+}
+
+#[simd_test]
+fn construct_i64x4<S: Simd>(simd: S) {
+    let vals = [-9, 18, i64::MAX - 7, i64::MIN + 9]; // includes lanes near both i64 limits
+    let a = i64x4::from_slice(simd, &vals);
+    let mut stored = [0_i64; 4];
+    a.store_slice(&mut stored); // slice round-trip: from_slice -> store_slice
+    assert_eq!(stored, vals);
+    assert_eq!(*i64x4::splat(simd, -9), [-9, -9, -9, -9]);
+    assert_eq!(*i64x4::simd_from(simd, vals), vals);
+    assert_eq!(*i64x4::from_fn(simd, |i| vals[i]), vals);
+    assert_eq!(*i64x4::from_bytes(a.to_bytes()), vals); // byte round-trip
+}
+
+#[simd_test]
+fn construct_i64x8<S: Simd>(simd: S) {
+    let vals = [-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024];
+    let a = i64x8::from_slice(simd, &vals);
+    let mut stored = [0_i64; 8];
+    a.store_slice(&mut stored); // slice round-trip: from_slice -> store_slice
+    assert_eq!(stored, vals);
+    assert_eq!(*i64x8::splat(simd, -9), [-9, -9, -9, -9, -9, -9, -9, -9]);
+    assert_eq!(*i64x8::simd_from(simd, vals), vals);
+    assert_eq!(*i64x8::from_fn(simd, |i| vals[i]), vals);
+    assert_eq!(*i64x8::from_bytes(a.to_bytes()), vals); // byte round-trip
+}
+
+#[simd_test]
+fn construct_u64x2<S: Simd>(simd: S) {
+    let vals = [0, 1_u64 << 63]; // 1 << 63 has only the top bit set
+    let a = u64x2::from_slice(simd, &vals);
+    let mut stored = [0_u64; 2];
+    a.store_slice(&mut stored); // slice round-trip: from_slice -> store_slice
+    assert_eq!(stored, vals);
+    assert_eq!(*u64x2::splat(simd, vals[0]), [0, 0]); // vals[0] is 0
+    assert_eq!(*u64x2::simd_from(simd, vals), vals);
+    assert_eq!(*u64x2::from_fn(simd, |i| vals[i]), vals);
+    assert_eq!(*u64x2::from_bytes(a.to_bytes()), vals); // byte round-trip
+}
+
+#[simd_test]
+fn construct_u64x4<S: Simd>(simd: S) {
+    let vals = [0, 1_u64 << 63, u64::MAX - 3, 42]; // covers values above i64::MAX
+    let a = u64x4::from_slice(simd, &vals);
+    let mut stored = [0_u64; 4];
+    a.store_slice(&mut stored); // slice round-trip: from_slice -> store_slice
+    assert_eq!(stored, vals);
+    assert_eq!(*u64x4::splat(simd, vals[0]), [0, 0, 0, 0]); // vals[0] is 0
+    assert_eq!(*u64x4::simd_from(simd, vals), vals);
+    assert_eq!(*u64x4::from_fn(simd, |i| vals[i]), vals);
+    assert_eq!(*u64x4::from_bytes(a.to_bytes()), vals); // byte round-trip
+}
+
+#[simd_test]
+fn construct_u64x8<S: Simd>(simd: S) {
+    let vals = [
+        0,
+        1_u64 << 63,
+        u64::MAX - 3,
+        42,
+        17,
+        99,
+        123456789,
+        u64::MAX,
+    ];
+    let a = u64x8::from_slice(simd, &vals);
+    let mut stored = [0_u64; 8];
+    a.store_slice(&mut stored); // slice round-trip: from_slice -> store_slice
+    assert_eq!(stored, vals);
+    assert_eq!(*u64x8::splat(simd, vals[0]), [0, 0, 0, 0, 0, 0, 0, 0]); // vals[0] is 0
+    assert_eq!(*u64x8::simd_from(simd, vals), vals);
+    assert_eq!(*u64x8::from_fn(simd, |i| vals[i]), vals);
+    assert_eq!(*u64x8::from_bytes(a.to_bytes()), vals); // byte round-trip
+}
+
+#[simd_test]
+fn arithmetic_i64x2<S: Simd>(simd: S) {
+    let a = i64x2::from_slice(simd, &[-9, 18]);
+    let b = i64x2::from_slice(simd, &[3, -6]);
+    assert_eq!(*(a + b), [-6, 12]); // each operator is checked lane by lane
+    assert_eq!(*(a - b), [-12, 24]);
+    assert_eq!(*(a * b), [-27, -108]); // mixed signs, no overflow
+}
+
+#[simd_test]
+fn arithmetic_i64x4<S: Simd>(simd: S) {
+    let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]);
+    let b = i64x4::from_slice(simd, &[3, -6, 5, -7]);
+    assert_eq!(
+        *(a + b),
+        [-6, 12, 9223372036854775805, -9223372036854775806]
+    ); // sums stay in range: i64::MAX - 2 and i64::MIN + 2
+    assert_eq!(
+        *(a - b),
+        [-12, 24, 9223372036854775795, -9223372036854775792]
+    );
+    assert_eq!(
+        *(a * b),
+        [-27, -108, 9223372036854775768, 9223372036854775745]
+    ); // lanes 2 and 3 overflow i64; expected values are the wrapped products
+}
+
+#[simd_test]
+fn arithmetic_i64x8<S: Simd>(simd: S) {
+    let a = i64x8::from_slice(
+        simd,
+        &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024],
+    );
+    let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]);
+    assert_eq!(
+        *(a + b),
+        [
+            -6,
+            12,
+            9223372036854775805,
+            -9223372036854775806,
+            112,
+            -443,
+            772,
+            -1005
+        ]
+    ); // no lane overflows for addition
+    assert_eq!(
+        *(a - b),
+        [
+            -12,
+            24,
+            9223372036854775795,
+            -9223372036854775792,
+            134,
+            -469,
+            806,
+            -1043
+        ]
+    );
+    assert_eq!(
+        *(a * b),
+        [
+            -27,
+            -108,
+            9223372036854775768,
+            9223372036854775745,
+            -1353,
+            -5928,
+            -13413,
+            -19456
+        ]
+    ); // lanes 2 and 3 overflow i64; expected values are the wrapped products
+}
+
+#[simd_test]
+fn arithmetic_u64x2<S: Simd>(simd: S) {
+    let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]);
+    let b = u64x2::from_slice(simd, &[u64::MAX, 7]);
+    assert_eq!(*(a + b), [u64::MAX, 9223372036854775815]);
+    assert_eq!(*(a - b), [1, 9223372036854775801]); // 0 - u64::MAX wraps to 1
+    assert_eq!(*(a * b), [0, 1_u64 << 63]); // (1 << 63) * 7 wraps to 1 << 63
+}
+
+#[simd_test]
+fn arithmetic_u64x4<S: Simd>(simd: S) {
+    let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]);
+    let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]);
+    assert_eq!(*(a + b), [u64::MAX, 9223372036854775815, 9, 1041]); // lane 2 wraps to 9
+    assert_eq!(
+        *(a - b),
+        [
+            1,
+            9223372036854775801,
+            18446744073709551599,
+            18446744073709550659
+        ]
+    ); // lanes 0 and 3 underflow and wrap (0 - u64::MAX, 42 - 999)
+    assert_eq!(*(a * b), [0, 1_u64 << 63, 18446744073709551564, 41958]);
+}
+
+#[simd_test]
+fn arithmetic_u64x8<S: Simd>(simd: S) {
+    let a = u64x8::from_slice(
+        simd,
+        &[
+            0,
+            1_u64 << 63,
+            u64::MAX - 3,
+            42,
+            17,
+            99,
+            123456789,
+            u64::MAX,
+        ],
+    );
+    let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]);
+    assert_eq!(
+        *(a + b),
+        [
+            u64::MAX,
+            9223372036854775815,
+            9,
+            1041,
+            46,
+            110,
+            1111111110,
+            0
+        ]
+    ); // lane 2 wraps to 9; last lane is u64::MAX + 1, which wraps to 0
+    assert_eq!(
+        *(a - b),
+        [
+            1,
+            9223372036854775801,
+            18446744073709551599,
+            18446744073709550659,
+            18446744073709551604,
+            88,
+            18446744072845354084,
+            18446744073709551614
+        ]
+    ); // lanes 0, 3, 4 and 6 underflow and wrap
+    assert_eq!(
+        *(a * b),
+        [
+            0,
+            1_u64 << 63,
+            18446744073709551564,
+            41958,
+            493,
+            1089,
+            121932631112635269,
+            u64::MAX
+        ]
+    ); // lanes 1 and 2 overflow u64; expected values are the wrapped products
+}
+
+#[simd_test]
+fn neg_i64x8<S: Simd>(simd: S) {
+    let a = i64x8::from_slice(simd, &[-1, 2, -3, 4, -5, 6, -7, 8]); // alternating signs
+    assert_eq!(*(-a), [1, -2, 3, -4, 5, -6, 7, -8]); // every lane flips sign
+}
+
+#[simd_test]
+fn bitwise_i64x2<S: Simd>(simd: S) {
+    let a = i64x2::from_slice(simd, &[-9, 18]);
+    let b = i64x2::from_slice(simd, &[3, -6]);
+    assert_eq!(*(a & b), [3, 18]); // expected values follow two's-complement bit patterns
+    assert_eq!(*(a | b), [-9, -6]);
+    assert_eq!(*(a ^ b), [-12, -24]);
+    assert_eq!(*(!a), [8, -19]); // !x == -x - 1
+}
+
+#[simd_test]
+fn bitwise_i64x4<S: Simd>(simd: S) {
+    let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]);
+    let b = i64x4::from_slice(simd, &[3, -6, 5, -7]);
+    assert_eq!(*(a & b), [3, 18, 0, -9223372036854775799]); // last lane equals i64::MIN + 9
+    assert_eq!(*(a | b), [-9, -6, 9223372036854775805, -7]);
+    assert_eq!(
+        *(a ^ b),
+        [-12, -24, 9223372036854775805, 9223372036854775792]
+    );
+    assert_eq!(*(!a), [8, -19, -9223372036854775801, 9223372036854775798]); // !x == -x - 1
+}
+
+#[simd_test]
+fn bitwise_i64x8<S: Simd>(simd: S) {
+    let a = i64x8::from_slice(
+        simd,
+        &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024],
+    );
+    let b = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]);
+    assert_eq!(*(a & b), [3, 18, 0, -9223372036854775799, 113, 8, 773, 0]);
+    assert_eq!(
+        *(a | b),
+        [-9, -6, 9223372036854775805, -7, -1, -451, -1, -1005]
+    );
+    assert_eq!(
+        *(a ^ b),
+        [
+            -12,
+            -24,
+            9223372036854775805,
+            9223372036854775792,
+            -114,
+            -459,
+            -774,
+            -1005
+        ]
+    ); // each lane equals (a | b) - (a & b)
+    assert_eq!(
+        *(!a),
+        [
+            8,
+            -19,
+            -9223372036854775801,
+            9223372036854775798,
+            -124,
+            455,
+            -790,
+            1023
+        ]
+    ); // !x == -x - 1
+}
+
+#[simd_test]
+fn bitwise_u64x2<S: Simd>(simd: S) {
+    let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]);
+    let b = u64x2::from_slice(simd, &[u64::MAX, 7]);
+    assert_eq!(*(a & b), [0, 0]); // the inputs share no set bits in either lane
+    assert_eq!(*(a | b), [u64::MAX, 9223372036854775815]);
+    assert_eq!(*(a ^ b), [u64::MAX, 9223372036854775815]); // same as OR since AND is 0
+    assert_eq!(*(!a), [u64::MAX, 9223372036854775807]); // !x == u64::MAX - x
+}
+
+#[simd_test]
+fn bitwise_u64x4<S: Simd>(simd: S) {
+    let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]);
+    let b = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]);
+    assert_eq!(*(a & b), [0, 0, 12, 34]); // lane-wise AND
+    assert_eq!(
+        *(a | b),
+        [u64::MAX, 9223372036854775815, 18446744073709551613, 1007]
+    );
+    assert_eq!(
+        *(a ^ b),
+        [u64::MAX, 9223372036854775815, 18446744073709551601, 973]
+    );
+    assert_eq!(
+        *(!a),
+        [u64::MAX, 9223372036854775807, 3, 18446744073709551573]
+    ); // !x == u64::MAX - x
+}
+
+#[simd_test]
+fn bitwise_u64x8<S: Simd>(simd: S) {
+    let a = u64x8::from_slice(
+        simd,
+        &[
+            0,
+            1_u64 << 63,
+            u64::MAX - 3,
+            42,
+            17,
+            99,
+            123456789,
+            u64::MAX,
+        ],
+    );
+    let b = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]);
+    assert_eq!(*(a & b), [0, 0, 12, 34, 17, 3, 39471121, 1]); // lane-wise AND
+    assert_eq!(
+        *(a | b),
+        [
+            u64::MAX,
+            9223372036854775815,
+            18446744073709551613,
+            1007,
+            29,
+            107,
+            1071639989,
+            u64::MAX
+        ]
+    );
+    assert_eq!(
+        *(a ^ b),
+        [
+            u64::MAX,
+            9223372036854775815,
+            18446744073709551601,
+            973,
+            12,
+            104,
+            1032168868,
+            18446744073709551614
+        ]
+    ); // each lane equals (a | b) - (a & b)
+    assert_eq!(
+        *(!a),
+        [
+            u64::MAX,
+            9223372036854775807,
+            3,
+            18446744073709551573,
+            18446744073709551598,
+            18446744073709551516,
+            18446744073586094826,
+            0
+        ]
+    ); // !x == u64::MAX - x
+}
+
+#[simd_test]
+fn shifts_i64x2<S: Simd>(simd: S) {
+    let a = i64x2::from_slice(simd, &[-9, 18]);
+    let shifts = i64x2::from_slice(simd, &[0, 1]); // a different shift amount per lane
+    assert_eq!(*(a << 2_u32), [-36, 72]); // scalar amount: all lanes shift by 2
+    assert_eq!(*(a >> 2_u32), [-3, 4]); // sign-preserving shift: -9 >> 2 == -3
+    assert_eq!(*(a << shifts), [-9, 36]);
+    assert_eq!(*(a >> shifts), [-9, 9]);
+}
+
+#[simd_test]
+fn shifts_i64x4<S: Simd>(simd: S) {
+    let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]);
+    let shifts = i64x4::from_slice(simd, &[0, 1, 2, 3]); // a different shift amount per lane
+    assert_eq!(*(a << 2_u32), [-36, 72, -32, 36]); // lanes 2 and 3 lose their high bits
+    assert_eq!(
+        *(a >> 2_u32),
+        [-3, 4, 2305843009213693950, -2305843009213693950]
+    ); // negative lanes stay negative (sign-preserving shift)
+    assert_eq!(*(a << shifts), [-9, 36, -32, 72]);
+    assert_eq!(
+        *(a >> shifts),
+        [-9, 9, 2305843009213693950, -1152921504606846975]
+    );
+}
+
+#[simd_test]
+fn shifts_i64x8<S: Simd>(simd: S) {
+    let a = i64x8::from_slice(
+        simd,
+        &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024],
+    );
+    let shifts = i64x8::from_slice(simd, &[0, 1, 2, 3, 0, 1, 2, 3]); // per-lane amounts
+    assert_eq!(*(a << 2_u32), [-36, 72, -32, 36, 492, -1824, 3156, -4096]);
+    assert_eq!(
+        *(a >> 2_u32),
+        [
+            -3,
+            4,
+            2305843009213693950,
+            -2305843009213693950,
+            30,
+            -114,
+            197,
+            -256
+        ]
+    ); // negative lanes stay negative (sign-preserving shift)
+    assert_eq!(*(a << shifts), [-9, 36, -32, 72, 123, -912, 3156, -8192]);
+    assert_eq!(
+        *(a >> shifts),
+        [
+            -9,
+            9,
+            2305843009213693950,
+            -1152921504606846975,
+            123,
+            -228,
+            197,
+            -128
+        ]
+    );
+}
+
+#[simd_test]
+fn shifts_u64x2<S: Simd>(simd: S) {
+    let a = u64x2::from_slice(simd, &[0, 1_u64 << 63]);
+    let shifts = u64x2::from_slice(simd, &[0, 1]); // a different shift amount per lane
+    assert_eq!(*(a << 2_u32), [0, 0]); // the top bit is shifted out
+    assert_eq!(*(a >> 2_u32), [0, 2305843009213693952]); // zero-filling shift: 1 << 61
+    assert_eq!(*(a << shifts), [0, 0]);
+    assert_eq!(*(a >> shifts), [0, 4611686018427387904]); // 1 << 62
+}
+
+#[simd_test]
+fn shifts_u64x4<S: Simd>(simd: S) {
+    let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]);
+    let shifts = u64x4::from_slice(simd, &[0, 1, 2, 3]); // a different shift amount per lane
+    assert_eq!(*(a << 2_u32), [0, 0, 18446744073709551600, 168]);
+    assert_eq!(
+        *(a >> 2_u32),
+        [0, 2305843009213693952, 4611686018427387903, 10]
+    ); // high bits are zero-filled (unsigned shift)
+    assert_eq!(*(a << shifts), [0, 0, 18446744073709551600, 336]);
+    assert_eq!(
+        *(a >> shifts),
+        [0, 4611686018427387904, 4611686018427387903, 5]
+    );
+}
+
+#[simd_test]
+fn shifts_u64x8<S: Simd>(simd: S) {
+    let a = u64x8::from_slice(
+        simd,
+        &[
+            0,
+            1_u64 << 63,
+            u64::MAX - 3,
+            42,
+            17,
+            99,
+            123456789,
+            u64::MAX,
+        ],
+    );
+    let shifts = u64x8::from_slice(simd, &[0, 1, 2, 3, 0, 1, 2, 3]); // per-lane amounts
+    assert_eq!(
+        *(a << 2_u32),
+        [
+            0,
+            0,
+            18446744073709551600,
+            168,
+            68,
+            396,
+            493827156,
+            18446744073709551612
+        ]
+    ); // bits shifted past bit 63 are discarded
+    assert_eq!(
+        *(a >> 2_u32),
+        [
+            0,
+            2305843009213693952,
+            4611686018427387903,
+            10,
+            4,
+            24,
+            30864197,
+            4611686018427387903
+        ]
+    ); // high bits are zero-filled (unsigned shift)
+    assert_eq!(
+        *(a << shifts),
+        [
+            0,
+            0,
+            18446744073709551600,
+            336,
+            17,
+            198,
+            493827156,
+            18446744073709551608
+        ]
+    );
+    assert_eq!(
+        *(a >> shifts),
+        [
+            0,
+            4611686018427387904,
+            4611686018427387903,
+            5,
+            17,
+            49,
+            30864197,
+            2305843009213693951
+        ]
+    );
+}
+
+#[simd_test]
+fn compare_i64x2<S: Simd>(simd: S) {
+    let a = i64x2::from_slice(simd, &[-9, 18]);
+    let b = i64x2::from_slice(simd, &[3, -6]);
+    assert_eq!(<[i64; 2]>::from(a.simd_eq(b)), [0, 0]); // no lanes are equal
+    assert_eq!(<[i64; 2]>::from(a.simd_lt(b)), [-1, 0]); // true lanes read back as -1
+    assert_eq!(<[i64; 2]>::from(a.simd_le(b)), [-1, 0]);
+    assert_eq!(<[i64; 2]>::from(a.simd_ge(b)), [0, -1]);
+    assert_eq!(<[i64; 2]>::from(a.simd_gt(b)), [0, -1]);
+}
+
+#[simd_test]
+fn compare_i64x4<S: Simd>(simd: S) {
+    let a = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]);
+    let b = i64x4::from_slice(simd, &[3, -6, i64::MAX - 7, -7]); // lane 2 equals `a`
+    assert_eq!(<[i64; 4]>::from(a.simd_eq(b)), [0, 0, -1, 0]); // -1 marks a true lane
+    assert_eq!(<[i64; 4]>::from(a.simd_lt(b)), [-1, 0, 0, -1]);
+    assert_eq!(<[i64; 4]>::from(a.simd_le(b)), [-1, 0, -1, -1]); // differs from `lt` on lane 2
+    assert_eq!(<[i64; 4]>::from(a.simd_ge(b)), [0, -1, -1, 0]);
+    assert_eq!(<[i64; 4]>::from(a.simd_gt(b)), [0, -1, 0, 0]); // differs from `ge` on lane 2
+}
+
+#[simd_test]
+fn compare_i64x8<S: Simd>(simd: S) {
+    let x = i64x8::from_slice(
+        simd,
+        &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024],
+    );
+    let y = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]);
+    assert_eq!(<[i64; 8]>::from(x.simd_eq(y)), [0, 0, 0, 0, 0, 0, 0, 0]);
+    assert_eq!(<[i64; 8]>::from(x.simd_lt(y)), [-1, 0, 0, -1, 0, -1, 0, -1]);
+    assert_eq!(<[i64; 8]>::from(x.simd_le(y)), [-1, 0, 0, -1, 0, -1, 0, -1]);
+    assert_eq!(<[i64; 8]>::from(x.simd_ge(y)), [0, -1, -1, 0, -1, 0, -1, 0]);
+    assert_eq!(<[i64; 8]>::from(x.simd_gt(y)), [0, -1, -1, 0, -1, 0, -1, 0]);
+}
+
+#[simd_test]
+fn compare_u64x2<S: Simd>(simd: S) {
+    let x = u64x2::from_slice(simd, &[0, 1_u64 << 63]);
+    let y = u64x2::from_slice(simd, &[u64::MAX, 7]);
+    assert_eq!(<[i64; 2]>::from(x.simd_eq(y)), [0, 0]);
+    assert_eq!(<[i64; 2]>::from(x.simd_lt(y)), [-1, 0]); // a true lane reads back as -1
+    assert_eq!(<[i64; 2]>::from(x.simd_le(y)), [-1, 0]);
+    assert_eq!(<[i64; 2]>::from(x.simd_ge(y)), [0, -1]);
+    assert_eq!(<[i64; 2]>::from(x.simd_gt(y)), [0, -1]);
+}
+
+#[simd_test]
+fn compare_u64x4<S: Simd>(simd: S) {
+    let a = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]);
+    let b = u64x4::from_slice(simd, &[u64::MAX, 7, u64::MAX - 3, 999]); // lane 2 equals `a`
+    assert_eq!(<[i64; 4]>::from(a.simd_eq(b)), [0, 0, -1, 0]); // -1 marks a true lane
+    assert_eq!(<[i64; 4]>::from(a.simd_lt(b)), [-1, 0, 0, -1]);
+    assert_eq!(<[i64; 4]>::from(a.simd_le(b)), [-1, 0, -1, -1]); // differs from `lt` on lane 2
+    assert_eq!(<[i64; 4]>::from(a.simd_ge(b)), [0, -1, -1, 0]);
+    assert_eq!(<[i64; 4]>::from(a.simd_gt(b)), [0, -1, 0, 0]); // differs from `ge` on lane 2
+}
+
+#[simd_test]
+fn compare_u64x8<S: Simd>(simd: S) {
+    let x = u64x8::from_slice(
+        simd,
+        &[
+            0,
+            1_u64 << 63,
+            u64::MAX - 3,
+            42,
+            17,
+            99,
+            123456789,
+            u64::MAX,
+        ],
+    );
+    let y = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]);
+    assert_eq!(<[i64; 8]>::from(x.simd_eq(y)), [0, 0, 0, 0, 0, 0, 0, 0]);
+    assert_eq!(<[i64; 8]>::from(x.simd_lt(y)), [-1, 0, 0, -1, -1, 0, -1, 0]);
+    assert_eq!(<[i64; 8]>::from(x.simd_le(y)), [-1, 0, 0, -1, -1, 0, -1, 0]);
+    assert_eq!(<[i64; 8]>::from(x.simd_ge(y)), [0, -1, -1, 0, 0, -1, 0, -1]);
+    assert_eq!(<[i64; 8]>::from(x.simd_gt(y)), [0, -1, -1, 0, 0, -1, 0, -1]);
+}
+
+#[simd_test]
+fn min_max_i64x2<S: Simd>(simd: S) {
+    let x = i64x2::from_slice(simd, &[-9, 18]);
+    let y = i64x2::from_slice(simd, &[3, -6]);
+    assert_eq!(*x.min(y), [-9, -6]); // lane-wise smaller value
+    assert_eq!(*x.max(y), [3, 18]); // lane-wise larger value
+}
+
+#[simd_test]
+fn min_max_i64x4<S: Simd>(simd: S) {
+    let x = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]);
+    let y = i64x4::from_slice(simd, &[3, -6, 5, -7]);
+    assert_eq!(*x.min(y), [-9, -6, 5, -9223372036854775799]); // last lane is i64::MIN + 9
+    assert_eq!(*x.max(y), [3, 18, 9223372036854775800, -7]); // lane 2 is i64::MAX - 7
+}
+
+#[simd_test]
+fn min_max_i64x8<S: Simd>(simd: S) {
+    let x = i64x8::from_slice(
+        simd,
+        &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024],
+    );
+    let y = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]);
+    assert_eq!(
+        *x.min(y),
+        [-9, -6, 5, -9223372036854775799, -11, -456, -17, -1024]
+    );
+    assert_eq!(
+        *x.max(y),
+        [3, 18, 9223372036854775800, -7, 123, 13, 789, 19]
+    );
+}
+
+#[simd_test]
+fn min_max_u64x2<S: Simd>(simd: S) {
+    let x = u64x2::from_slice(simd, &[0, 1_u64 << 63]);
+    let y = u64x2::from_slice(simd, &[u64::MAX, 7]);
+    assert_eq!(*x.min(y), [0, 7]); // lane-wise smaller value
+    assert_eq!(*x.max(y), [u64::MAX, 1_u64 << 63]); // top-bit values compare as large
+}
+
+#[simd_test]
+fn min_max_u64x4<S: Simd>(simd: S) {
+    let x = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]);
+    let y = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]);
+    assert_eq!(*x.min(y), [0, 7, 13, 42]);
+    assert_eq!(*x.max(y), [u64::MAX, 1_u64 << 63, u64::MAX - 3, 999]);
+}
+
+#[simd_test]
+fn min_max_u64x8<S: Simd>(simd: S) {
+    let x = u64x8::from_slice(
+        simd,
+        &[
+            0,
+            1_u64 << 63,
+            u64::MAX - 3,
+            42,
+            17,
+            99,
+            123456789,
+            u64::MAX,
+        ],
+    );
+    let y = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]);
+    assert_eq!(*x.min(y), [0, 7, 13, 42, 17, 11, 123456789, 1]);
+    assert_eq!(
+        *x.max(y),
+        [
+            u64::MAX,
+            1_u64 << 63,
+            u64::MAX - 3,
+            999,
+            29,
+            99,
+            987654321,
+            u64::MAX
+        ]
+    );
+}
+
+#[simd_test]
+fn select_i64x2<S: Simd>(simd: S) {
+    let cond = mask64x2::from_slice(simd, &[-1, 0]);
+    let x = i64x2::from_slice(simd, &[-9, 18]);
+    let y = i64x2::from_slice(simd, &[3, -6]);
+    assert_eq!(*cond.select(x, y), [-9, -6]); // -1 lanes take `x`, 0 lanes take `y`
+}
+
+#[simd_test]
+fn select_i64x4<S: Simd>(simd: S) {
+    let cond = mask64x4::from_slice(simd, &[-1, 0, -1, 0]);
+    let x = i64x4::from_slice(simd, &[-9, 18, i64::MAX - 7, i64::MIN + 9]);
+    let y = i64x4::from_slice(simd, &[3, -6, 5, -7]);
+    assert_eq!(*cond.select(x, y), [-9, -6, 9223372036854775800, -7]);
+}
+
+#[simd_test]
+fn select_i64x8<S: Simd>(simd: S) {
+    let cond = mask64x8::from_slice(simd, &[-1, 0, -1, 0, -1, 0, -1, 0]);
+    let x = i64x8::from_slice(
+        simd,
+        &[-9, 18, i64::MAX - 7, i64::MIN + 9, 123, -456, 789, -1024],
+    );
+    let y = i64x8::from_slice(simd, &[3, -6, 5, -7, -11, 13, -17, 19]);
+    assert_eq!(
+        *cond.select(x, y),
+        [-9, -6, 9223372036854775800, -7, 123, 13, 789, 19]
+    );
+}
+
+#[simd_test]
+fn select_u64x2<S: Simd>(simd: S) {
+    let cond = mask64x2::from_slice(simd, &[-1, 0]);
+    let x = u64x2::from_slice(simd, &[0, 1_u64 << 63]);
+    let y = u64x2::from_slice(simd, &[u64::MAX, 7]);
+    assert_eq!(*cond.select(x, y), [0, 7]); // -1 lanes take `x`, 0 lanes take `y`
+}
+
+#[simd_test]
+fn select_u64x4<S: Simd>(simd: S) {
+    let cond = mask64x4::from_slice(simd, &[-1, 0, -1, 0]);
+    let x = u64x4::from_slice(simd, &[0, 1_u64 << 63, u64::MAX - 3, 42]);
+    let y = u64x4::from_slice(simd, &[u64::MAX, 7, 13, 999]);
+    assert_eq!(*cond.select(x, y), [0, 7, u64::MAX - 3, 999]);
+}
+
+#[simd_test]
+fn select_u64x8<S: Simd>(simd: S) {
+    let cond = mask64x8::from_slice(simd, &[-1, 0, -1, 0, -1, 0, -1, 0]);
+    let x = u64x8::from_slice(
+        simd,
+        &[
+            0,
+            1_u64 << 63,
+            u64::MAX - 3,
+            42,
+            17,
+            99,
+            123456789,
+            u64::MAX,
+        ],
+    );
+    let y = u64x8::from_slice(simd, &[u64::MAX, 7, 13, 999, 29, 11, 987654321, 1]);
+    assert_eq!(
+        *cond.select(x, y),
+        [0, 7, u64::MAX - 3, 999, 17, 11, 123456789, 1]
+    );
+}
+
+#[simd_test]
+fn slide_i64x2<S: Simd>(simd: S) {
+    let x = i64x2::from_slice(simd, &[1, 2]);
+    let y = i64x2::from_slice(simd, &[3, 4]);
+    assert_eq!(*x.slide::<1>(y), [2, 3]); // drops x[0], pulls in y[0]
+}
+
+#[simd_test]
+fn slide_i64x4<S: Simd>(simd: S) {
+    let x = i64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = i64x4::from_slice(simd, &[5, 6, 7, 8]);
+    assert_eq!(*x.slide::<1>(y), [2, 3, 4, 5]); // drops x[0], pulls in y[0]
+}
+
+#[simd_test]
+fn slide_i64x8<S: Simd>(simd: S) {
+    let x = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    assert_eq!(*x.slide::<1>(y), [2, 3, 4, 5, 6, 7, 8, 9]); // drops x[0], pulls in y[0]
+}
+
+#[simd_test]
+fn slide_u64x2<S: Simd>(simd: S) {
+    let x = u64x2::from_slice(simd, &[1, 2]);
+    let y = u64x2::from_slice(simd, &[3, 4]);
+    assert_eq!(*x.slide::<1>(y), [2, 3]); // drops x[0], pulls in y[0]
+}
+
+#[simd_test]
+fn slide_u64x4<S: Simd>(simd: S) {
+    let x = u64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = u64x4::from_slice(simd, &[5, 6, 7, 8]);
+    assert_eq!(*x.slide::<1>(y), [2, 3, 4, 5]); // drops x[0], pulls in y[0]
+}
+
+#[simd_test]
+fn slide_u64x8<S: Simd>(simd: S) {
+    let x = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    assert_eq!(*x.slide::<1>(y), [2, 3, 4, 5, 6, 7, 8, 9]); // drops x[0], pulls in y[0]
+}
+
+#[simd_test]
+fn i64_split_combine<S: Simd>(simd: S) {
+    let lo = i64x2::from_slice(simd, &[1, 2]);
+    let hi = i64x2::from_slice(simd, &[3, 4]);
+    let quad_vec = lo.combine(hi); // x2 + x2 -> x4
+    assert_eq!(*quad_vec, [1, 2, 3, 4]);
+
+    let (lo, hi) = quad_vec.split();
+    assert_eq!(*lo, [1, 2]);
+    assert_eq!(*hi, [3, 4]);
+
+    let rest = i64x4::from_slice(simd, &[5, 6, 7, 8]);
+    let full = quad_vec.combine(rest); // x4 + x4 -> x8
+    assert_eq!(*full, [1, 2, 3, 4, 5, 6, 7, 8]);
+
+    let (lo, hi) = full.split();
+    assert_eq!(*lo, [1, 2, 3, 4]);
+    assert_eq!(*hi, [5, 6, 7, 8]);
+}
+
+#[simd_test]
+fn u64_split_combine<S: Simd>(simd: S) {
+    let lo = u64x2::from_slice(simd, &[1, 2]);
+    let hi = u64x2::from_slice(simd, &[3, 4]);
+    let quad_vec = lo.combine(hi); // x2 + x2 -> x4
+    assert_eq!(*quad_vec, [1, 2, 3, 4]);
+
+    let (lo, hi) = quad_vec.split();
+    assert_eq!(*lo, [1, 2]);
+    assert_eq!(*hi, [3, 4]);
+
+    let rest = u64x4::from_slice(simd, &[5, 6, 7, 8]);
+    let full = quad_vec.combine(rest); // x4 + x4 -> x8
+    assert_eq!(*full, [1, 2, 3, 4, 5, 6, 7, 8]);
+
+    let (lo, hi) = full.split();
+    assert_eq!(*lo, [1, 2, 3, 4]);
+    assert_eq!(*hi, [5, 6, 7, 8]);
+}
+
+#[simd_test]
+fn native_width_i64_u64<S: Simd>(simd: S) {
+    let lane_vals: Vec<i64> = (0..S::mask64s::N).map(|k| mask_lane(k % 2 == 0)).collect();
+    let cond = S::mask64s::from_slice(simd, &lane_vals);
+
+    let u_then: Vec<u64> = (0..S::u64s::N).map(|k| (1_u64 << 63) + k as u64).collect();
+    let u_other: Vec<u64> = (0..S::u64s::N).map(|k| k as u64).collect();
+    let u_combined = cond.select(
+        S::u64s::from_slice(simd, &u_then),
+        S::u64s::from_slice(simd, &u_other),
+    );
+    let u_expected: Vec<u64> = (0..S::u64s::N)
+        .map(|k| if k % 2 == 0 { u_then[k] } else { u_other[k] })
+        .collect();
+    assert_eq!(u_combined.as_slice(), u_expected);
+    assert_eq!(
+        (S::u64s::splat(simd, 3) * 7).as_slice(),
+        vec![21; S::u64s::N]
+    );
+
+    let i_then: Vec<i64> = (0..S::i64s::N).map(|k| -(k as i64) - 1).collect();
+    let i_other: Vec<i64> = (0..S::i64s::N).map(|k| k as i64 + 1).collect();
+    let i_combined = cond.select(
+        S::i64s::from_slice(simd, &i_then),
+        S::i64s::from_slice(simd, &i_other),
+    );
+    let i_expected: Vec<i64> = (0..S::i64s::N)
+        .map(|k| if k % 2 == 0 { i_then[k] } else { i_other[k] })
+        .collect();
+    assert_eq!(i_combined.as_slice(), i_expected);
+    assert_eq!(
+        (S::i64s::block_splat(i64x2::from_slice(simd, &[11, -12]))).as_slice(),
+        [11, -12].repeat(S::i64s::N / 2)
+    );
+    assert_eq!(
+        (S::u64s::block_splat(u64x2::from_slice(simd, &[13, 14]))).as_slice(),
+        [13, 14].repeat(S::u64s::N / 2)
+    );
+}
+
+#[simd_test]
+fn array_methods_i64x2<S: Simd>(simd: S) {
+    let v = simd.load_array_i64x2([1, 2]);
+    assert_eq!(simd.as_array_i64x2(v), [1, 2]);
+
+    let source = [3, 4];
+    let mut w = simd.load_array_ref_i64x2(&source);
+    assert_eq!(simd.as_array_ref_i64x2(&w), &[3, 4]);
+
+    simd.as_array_mut_i64x2(&mut w)[1] = 9; // write through the mutable array view
+    assert_eq!(*w, [3, 9]);
+
+    let mut sink = [0_i64; 2];
+    simd.store_array_i64x2(w, &mut sink);
+    assert_eq!(sink, [3, 9]);
+}
+
+#[simd_test]
+fn array_methods_i64x4<S: Simd>(simd: S) {
+    let v = simd.load_array_i64x4([1, 2, 3, 4]);
+    assert_eq!(simd.as_array_i64x4(v), [1, 2, 3, 4]);
+
+    let source = [5, 6, 7, 8];
+    let mut w = simd.load_array_ref_i64x4(&source);
+    assert_eq!(simd.as_array_ref_i64x4(&w), &[5, 6, 7, 8]);
+
+    simd.as_array_mut_i64x4(&mut w)[2] = 99; // write through the mutable array view
+    assert_eq!(*w, [5, 6, 99, 8]);
+
+    let mut sink = [0_i64; 4];
+    simd.store_array_i64x4(w, &mut sink);
+    assert_eq!(sink, [5, 6, 99, 8]);
+}
+
+#[simd_test]
+fn array_methods_i64x8<S: Simd>(simd: S) {
+    let v = simd.load_array_i64x8([1, 2, 3, 4, 5, 6, 7, 8]);
+    assert_eq!(simd.as_array_i64x8(v), [1, 2, 3, 4, 5, 6, 7, 8]);
+
+    let source = [9, 10, 11, 12, 13, 14, 15, 16];
+    let mut w = simd.load_array_ref_i64x8(&source);
+    assert_eq!(
+        simd.as_array_ref_i64x8(&w),
+        &[9, 10, 11, 12, 13, 14, 15, 16]
+    );
+
+    simd.as_array_mut_i64x8(&mut w)[4] = 99; // write through the mutable array view
+    assert_eq!(*w, [9, 10, 11, 12, 99, 14, 15, 16]);
+
+    let mut sink = [0_i64; 8];
+    simd.store_array_i64x8(w, &mut sink);
+    assert_eq!(sink, [9, 10, 11, 12, 99, 14, 15, 16]);
+}
+
+#[simd_test]
+fn array_methods_u64x2<S: Simd>(simd: S) {
+    let v = simd.load_array_u64x2([1, 2]);
+    assert_eq!(simd.as_array_u64x2(v), [1, 2]);
+
+    let source = [3, 4];
+    let mut w = simd.load_array_ref_u64x2(&source);
+    assert_eq!(simd.as_array_ref_u64x2(&w), &[3, 4]);
+
+    simd.as_array_mut_u64x2(&mut w)[1] = 9; // write through the mutable array view
+    assert_eq!(*w, [3, 9]);
+
+    let mut sink = [0_u64; 2];
+    simd.store_array_u64x2(w, &mut sink);
+    assert_eq!(sink, [3, 9]);
+}
+
+#[simd_test]
+fn array_methods_u64x4<S: Simd>(simd: S) {
+    let v = simd.load_array_u64x4([1, 2, 3, 4]);
+    assert_eq!(simd.as_array_u64x4(v), [1, 2, 3, 4]);
+
+    let source = [5, 6, 7, 8];
+    let mut w = simd.load_array_ref_u64x4(&source);
+    assert_eq!(simd.as_array_ref_u64x4(&w), &[5, 6, 7, 8]);
+
+    simd.as_array_mut_u64x4(&mut w)[2] = 99; // write through the mutable array view
+    assert_eq!(*w, [5, 6, 99, 8]);
+
+    let mut sink = [0_u64; 4];
+    simd.store_array_u64x4(w, &mut sink);
+    assert_eq!(sink, [5, 6, 99, 8]);
+}
+
+#[simd_test]
+fn array_methods_u64x8<S: Simd>(simd: S) {
+    let v = simd.load_array_u64x8([1, 2, 3, 4, 5, 6, 7, 8]);
+    assert_eq!(simd.as_array_u64x8(v), [1, 2, 3, 4, 5, 6, 7, 8]);
+
+    let source = [9, 10, 11, 12, 13, 14, 15, 16];
+    let mut w = simd.load_array_ref_u64x8(&source);
+    assert_eq!(
+        simd.as_array_ref_u64x8(&w),
+        &[9, 10, 11, 12, 13, 14, 15, 16]
+    );
+
+    simd.as_array_mut_u64x8(&mut w)[4] = 99; // write through the mutable array view
+    assert_eq!(*w, [9, 10, 11, 12, 99, 14, 15, 16]);
+
+    let mut sink = [0_u64; 8];
+    simd.store_array_u64x8(w, &mut sink);
+    assert_eq!(sink, [9, 10, 11, 12, 99, 14, 15, 16]);
+}
+
+#[simd_test]
+fn neg_i64x2<S: Simd>(simd: S) {
+    let v = i64x2::from_slice(simd, &[-1, 2]);
+    assert_eq!(*(-v), [1, -2]); // unary minus flips the sign of each lane
+}
+
+#[simd_test]
+fn neg_i64x4<S: Simd>(simd: S) {
+    let v = i64x4::from_slice(simd, &[-1, 2, -3, 4]);
+    assert_eq!(*(-v), [1, -2, 3, -4]); // unary minus flips the sign of each lane
+}
+
+#[simd_test]
+fn slide_within_blocks_i64x2<S: Simd>(simd: S) {
+    let x = i64x2::from_slice(simd, &[1, 2]);
+    let y = i64x2::from_slice(simd, &[3, 4]);
+    assert_eq!(*x.slide_within_blocks::<0>(y), [1, 2]); // shift 0 returns `x`
+    assert_eq!(*x.slide_within_blocks::<1>(y), [2, 3]);
+    assert_eq!(*x.slide_within_blocks::<2>(y), [3, 4]); // full shift returns `y`
+}
+
+#[simd_test]
+fn slide_within_blocks_i64x4<S: Simd>(simd: S) {
+    let x = i64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = i64x4::from_slice(simd, &[5, 6, 7, 8]);
+    assert_eq!(*x.slide_within_blocks::<0>(y), [1, 2, 3, 4]);
+    assert_eq!(*x.slide_within_blocks::<1>(y), [2, 5, 4, 7]); // each 2-lane block slides alone
+    assert_eq!(*x.slide_within_blocks::<2>(y), [5, 6, 7, 8]);
+}
+
+#[simd_test]
+fn slide_within_blocks_i64x8<S: Simd>(simd: S) {
+    let x = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    assert_eq!(*x.slide_within_blocks::<0>(y), [1, 2, 3, 4, 5, 6, 7, 8]);
+    assert_eq!(*x.slide_within_blocks::<1>(y), [2, 9, 4, 11, 6, 13, 8, 15]);
+    assert_eq!(
+        *x.slide_within_blocks::<2>(y),
+        [9, 10, 11, 12, 13, 14, 15, 16]
+    );
+}
+
+#[simd_test]
+fn slide_within_blocks_u64x2<S: Simd>(simd: S) {
+    let x = u64x2::from_slice(simd, &[1, 2]);
+    let y = u64x2::from_slice(simd, &[3, 4]);
+    assert_eq!(*x.slide_within_blocks::<0>(y), [1, 2]); // shift 0 returns `x`
+    assert_eq!(*x.slide_within_blocks::<1>(y), [2, 3]);
+    assert_eq!(*x.slide_within_blocks::<2>(y), [3, 4]); // full shift returns `y`
+}
+
+#[simd_test]
+fn slide_within_blocks_u64x4<S: Simd>(simd: S) {
+    let x = u64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = u64x4::from_slice(simd, &[5, 6, 7, 8]);
+    assert_eq!(*x.slide_within_blocks::<0>(y), [1, 2, 3, 4]);
+    assert_eq!(*x.slide_within_blocks::<1>(y), [2, 5, 4, 7]); // each 2-lane block slides alone
+    assert_eq!(*x.slide_within_blocks::<2>(y), [5, 6, 7, 8]);
+}
+
+#[simd_test]
+fn slide_within_blocks_u64x8<S: Simd>(simd: S) {
+    let x = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    assert_eq!(*x.slide_within_blocks::<0>(y), [1, 2, 3, 4, 5, 6, 7, 8]);
+    assert_eq!(*x.slide_within_blocks::<1>(y), [2, 9, 4, 11, 6, 13, 8, 15]);
+    assert_eq!(
+        *x.slide_within_blocks::<2>(y),
+        [9, 10, 11, 12, 13, 14, 15, 16]
+    );
+}
+
+#[simd_test]
+fn zip_unzip_i64x2<S: Simd>(simd: S) {
+    let x = i64x2::from_slice(simd, &[1, 2]);
+    let y = i64x2::from_slice(simd, &[3, 4]);
+    assert_eq!(*simd.zip_low_i64x2(x, y), [1, 3]);
+    assert_eq!(*simd.zip_high_i64x2(x, y), [2, 4]);
+    assert_eq!(*simd.unzip_low_i64x2(x, y), [1, 3]); // with 2 lanes, unzip matches zip
+    assert_eq!(*simd.unzip_high_i64x2(x, y), [2, 4]);
+}
+
+#[simd_test]
+fn zip_unzip_i64x4<S: Simd>(simd: S) {
+    let x = i64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = i64x4::from_slice(simd, &[5, 6, 7, 8]);
+    assert_eq!(*simd.zip_low_i64x4(x, y), [1, 5, 2, 6]); // low halves, alternating x/y
+    assert_eq!(*simd.zip_high_i64x4(x, y), [3, 7, 4, 8]); // high halves, alternating x/y
+    assert_eq!(*simd.unzip_low_i64x4(x, y), [1, 3, 5, 7]); // even lanes of x then y
+    assert_eq!(*simd.unzip_high_i64x4(x, y), [2, 4, 6, 8]); // odd lanes of x then y
+}
+
+#[simd_test]
+fn zip_unzip_i64x8<S: Simd>(simd: S) {
+    let x = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    assert_eq!(*simd.zip_low_i64x8(x, y), [1, 9, 2, 10, 3, 11, 4, 12]);
+    assert_eq!(*simd.zip_high_i64x8(x, y), [5, 13, 6, 14, 7, 15, 8, 16]);
+    assert_eq!(*simd.unzip_low_i64x8(x, y), [1, 3, 5, 7, 9, 11, 13, 15]);
+    assert_eq!(*simd.unzip_high_i64x8(x, y), [2, 4, 6, 8, 10, 12, 14, 16]);
+}
+
+#[simd_test]
+fn zip_unzip_u64x2<S: Simd>(simd: S) {
+    let x = u64x2::from_slice(simd, &[1, 2]);
+    let y = u64x2::from_slice(simd, &[3, 4]);
+    assert_eq!(*simd.zip_low_u64x2(x, y), [1, 3]);
+    assert_eq!(*simd.zip_high_u64x2(x, y), [2, 4]);
+    assert_eq!(*simd.unzip_low_u64x2(x, y), [1, 3]); // with 2 lanes, unzip matches zip
+    assert_eq!(*simd.unzip_high_u64x2(x, y), [2, 4]);
+}
+
+#[simd_test]
+fn zip_unzip_u64x4<S: Simd>(simd: S) {
+    let x = u64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = u64x4::from_slice(simd, &[5, 6, 7, 8]);
+    assert_eq!(*simd.zip_low_u64x4(x, y), [1, 5, 2, 6]); // low halves, alternating x/y
+    assert_eq!(*simd.zip_high_u64x4(x, y), [3, 7, 4, 8]); // high halves, alternating x/y
+    assert_eq!(*simd.unzip_low_u64x4(x, y), [1, 3, 5, 7]); // even lanes of x then y
+    assert_eq!(*simd.unzip_high_u64x4(x, y), [2, 4, 6, 8]); // odd lanes of x then y
+}
+
+#[simd_test]
+fn zip_unzip_u64x8<S: Simd>(simd: S) {
+    let x = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    assert_eq!(*simd.zip_low_u64x8(x, y), [1, 9, 2, 10, 3, 11, 4, 12]);
+    assert_eq!(*simd.zip_high_u64x8(x, y), [5, 13, 6, 14, 7, 15, 8, 16]);
+    assert_eq!(*simd.unzip_low_u64x8(x, y), [1, 3, 5, 7, 9, 11, 13, 15]);
+    assert_eq!(*simd.unzip_high_u64x8(x, y), [2, 4, 6, 8, 10, 12, 14, 16]);
+}
+
+#[simd_test]
+fn interleave_deinterleave_i64x2<S: Simd>(simd: S) {
+    let x = i64x2::from_slice(simd, &[1, 2]);
+    let y = i64x2::from_slice(simd, &[3, 4]);
+    let (lo, hi) = simd.interleave_i64x2(x, y);
+    assert_eq!(*lo, [1, 3]);
+    assert_eq!(*hi, [2, 4]);
+    let (x_recovered, y_recovered) = simd.deinterleave_i64x2(lo, hi); // undoes the interleave
+    assert_eq!(*x_recovered, [1, 2]);
+    assert_eq!(*y_recovered, [3, 4]);
+}
+
+#[simd_test]
+fn interleave_deinterleave_i64x4<S: Simd>(simd: S) {
+    let x = i64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = i64x4::from_slice(simd, &[5, 6, 7, 8]);
+    let (lo, hi) = simd.interleave_i64x4(x, y);
+    assert_eq!(*lo, [1, 5, 2, 6]);
+    assert_eq!(*hi, [3, 7, 4, 8]);
+    let (x_recovered, y_recovered) = simd.deinterleave_i64x4(lo, hi); // undoes the interleave
+    assert_eq!(*x_recovered, [1, 2, 3, 4]);
+    assert_eq!(*y_recovered, [5, 6, 7, 8]);
+
+    let (lo, hi) = x.interleave(y); // same round trip through the vector methods
+    let (x_recovered, y_recovered) = lo.deinterleave(hi);
+    assert_eq!(*x_recovered, [1, 2, 3, 4]);
+    assert_eq!(*y_recovered, [5, 6, 7, 8]);
+}
+
+#[simd_test]
+fn interleave_deinterleave_i64x8<S: Simd>(simd: S) {
+    let x = i64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = i64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    let (lo, hi) = simd.interleave_i64x8(x, y);
+    assert_eq!(*lo, [1, 9, 2, 10, 3, 11, 4, 12]);
+    assert_eq!(*hi, [5, 13, 6, 14, 7, 15, 8, 16]);
+    let (x_recovered, y_recovered) = simd.deinterleave_i64x8(lo, hi); // undoes the interleave
+    assert_eq!(*x_recovered, [1, 2, 3, 4, 5, 6, 7, 8]);
+    assert_eq!(*y_recovered, [9, 10, 11, 12, 13, 14, 15, 16]);
+}
+
+#[simd_test]
+fn interleave_deinterleave_u64x2<S: Simd>(simd: S) {
+    let x = u64x2::from_slice(simd, &[1, 2]);
+    let y = u64x2::from_slice(simd, &[3, 4]);
+    let (lo, hi) = simd.interleave_u64x2(x, y);
+    assert_eq!(*lo, [1, 3]);
+    assert_eq!(*hi, [2, 4]);
+    let (x_recovered, y_recovered) = simd.deinterleave_u64x2(lo, hi); // undoes the interleave
+    assert_eq!(*x_recovered, [1, 2]);
+    assert_eq!(*y_recovered, [3, 4]);
+}
+
+#[simd_test]
+fn interleave_deinterleave_u64x4<S: Simd>(simd: S) {
+    let x = u64x4::from_slice(simd, &[1, 2, 3, 4]);
+    let y = u64x4::from_slice(simd, &[5, 6, 7, 8]);
+    let (lo, hi) = simd.interleave_u64x4(x, y);
+    assert_eq!(*lo, [1, 5, 2, 6]);
+    assert_eq!(*hi, [3, 7, 4, 8]);
+    let (x_recovered, y_recovered) = simd.deinterleave_u64x4(lo, hi); // undoes the interleave
+    assert_eq!(*x_recovered, [1, 2, 3, 4]);
+    assert_eq!(*y_recovered, [5, 6, 7, 8]);
+
+    let (lo, hi) = x.interleave(y); // same round trip through the vector methods
+    let (x_recovered, y_recovered) = lo.deinterleave(hi);
+    assert_eq!(*x_recovered, [1, 2, 3, 4]);
+    assert_eq!(*y_recovered, [5, 6, 7, 8]);
+}
+
+#[simd_test]
+fn interleave_deinterleave_u64x8<S: Simd>(simd: S) {
+    let x = u64x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]);
+    let y = u64x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]);
+    let (lo, hi) = simd.interleave_u64x8(x, y);
+    assert_eq!(*lo, [1, 9, 2, 10, 3, 11, 4, 12]);
+    assert_eq!(*hi, [5, 13, 6, 14, 7, 15, 8, 16]);
+    let (x_recovered, y_recovered) = simd.deinterleave_u64x8(lo, hi); // undoes the interleave
+    assert_eq!(*x_recovered, [1, 2, 3, 4, 5, 6, 7, 8]);
+    assert_eq!(*y_recovered, [9, 10, 11, 12, 13, 14, 15, 16]);
+}
+
+#[simd_test]
+fn load_store_interleaved_128_u64x8<S: Simd>(simd: S) {
+    let flat = [1, 2, 101, 102, 201, 202, 301, 302];
+    let result = simd.load_interleaved_128_u64x8(&flat);
+    assert_eq!(*result, [1, 101, 201, 301, 2, 102, 202, 302]); // even inputs first, then odd
+
+    let v = u64x8::from_slice(simd, &[1, 101, 201, 301, 2, 102, 202, 302]);
+    let mut sink = [0_u64; 8];
+    simd.store_interleaved_128_u64x8(v, &mut sink); // the store reverses the load's reordering
+    assert_eq!(sink, flat);
+}
+
+#[simd_test]
+fn reinterpret_i64x2<S: Simd>(simd: S) {
+    let v = i64x2::from_slice(simd, &[1, -2]);
+    let bytes: u8x16<S> = v.bitcast(); // reference result for the u8 reinterpret
+    let words: u32x4<S> = v.bitcast(); // reference result for the u32 reinterpret
+    assert_eq!(simd.reinterpret_u8_i64x2(v).as_slice(), bytes.as_slice());
+    assert_eq!(simd.reinterpret_u32_i64x2(v).as_slice(), words.as_slice());
+}
+
+#[simd_test]
+fn reinterpret_i64x4<S: Simd>(simd: S) {
+    let v = i64x4::from_slice(simd, &[1, -2, 3, -4]);
+    let bytes: u8x32<S> = v.bitcast(); // reference result for the u8 reinterpret
+    let words: u32x8<S> = v.bitcast(); // reference result for the u32 reinterpret
+    assert_eq!(simd.reinterpret_u8_i64x4(v).as_slice(), bytes.as_slice());
+    assert_eq!(simd.reinterpret_u32_i64x4(v).as_slice(), words.as_slice());
+}
+
+#[simd_test]
+fn reinterpret_i64x8<S: Simd>(simd: S) {
+    let v = i64x8::from_slice(simd, &[1, -2, 3, -4, 5, -6, 7, -8]);
+    let bytes: u8x64<S> = v.bitcast(); // reference result for the u8 reinterpret
+    let words: u32x16<S> = v.bitcast(); // reference result for the u32 reinterpret
+    assert_eq!(simd.reinterpret_u8_i64x8(v).as_slice(), bytes.as_slice());
+    assert_eq!(simd.reinterpret_u32_i64x8(v).as_slice(), words.as_slice());
+}
+
+#[simd_test]
+fn reinterpret_u64x2<S: Simd>(simd: S) {
+    let v = u64x2::from_slice(simd, &[1, u64::MAX - 1]);
+    let bytes: u8x16<S> = v.bitcast(); // reference result for the u8 reinterpret
+    let words: u32x4<S> = v.bitcast(); // reference result for the u32 reinterpret
+    assert_eq!(simd.reinterpret_u8_u64x2(v).as_slice(), bytes.as_slice());
+    assert_eq!(simd.reinterpret_u32_u64x2(v).as_slice(), words.as_slice());
+}
+
+#[simd_test]
+fn reinterpret_u64x4<S: Simd>(simd: S) {
+    let v = u64x4::from_slice(simd, &[1, u64::MAX - 1, 3, u64::MAX - 3]);
+    let bytes: u8x32<S> = v.bitcast(); // reference result for the u8 reinterpret
+    let words: u32x8<S> = v.bitcast(); // reference result for the u32 reinterpret
+    assert_eq!(simd.reinterpret_u8_u64x4(v).as_slice(), bytes.as_slice());
+    assert_eq!(simd.reinterpret_u32_u64x4(v).as_slice(), words.as_slice());
+}
+
+#[simd_test]
+fn reinterpret_u64x8<S: Simd>(simd: S) {
+    let v = u64x8::from_slice(simd, &[1, u64::MAX - 1, 3, u64::MAX - 3, 5, 6, 7, 8]);
+    let bytes: u8x64<S> = v.bitcast(); // reference result for the u8 reinterpret
+    let words: u32x16<S> = v.bitcast(); // reference result for the u32 reinterpret
+    assert_eq!(simd.reinterpret_u8_u64x8(v).as_slice(), bytes.as_slice());
+    assert_eq!(simd.reinterpret_u32_u64x8(v).as_slice(), words.as_slice());
+}
+
+#[simd_test]
+fn mask64x2_ops<S: Simd>(simd: S) {
+    let t = simd.splat_mask64x2(true);
+    let f = simd.splat_mask64x2(false);
+    assert_eq!(simd.as_array_mask64x2(t), [-1, -1]); // true lanes read back as -1
+    assert_eq!(simd.as_array_mask64x2(f), [0, 0]); // false lanes read back as 0
+
+    let x = simd.load_array_mask64x2([-1, 0]);
+    let y = simd.load_array_mask64x2([0, -1]);
+    assert_eq!(simd.as_array_mask64x2(x), [-1, 0]);
+    assert_eq!(simd.as_array_mask64x2(simd.and_mask64x2(x, y)), [0, 0]);
+    assert_eq!(simd.as_array_mask64x2(simd.or_mask64x2(x, y)), [-1, -1]);
+    assert_eq!(simd.as_array_mask64x2(simd.xor_mask64x2(x, y)), [-1, -1]);
+    assert_eq!(simd.as_array_mask64x2(simd.not_mask64x2(x)), [0, -1]);
+    assert_eq!(
+        simd.as_array_mask64x2(simd.select_mask64x2(x, t, f)),
+        [-1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x2(simd.simd_eq_mask64x2(x, x)),
+        [-1, -1]
+    );
+    assert_eq!(simd.as_array_mask64x2(simd.simd_eq_mask64x2(x, y)), [0, 0]);
+
+    let mut compact = simd.from_bitmask_mask64x2(0b01); // bit 0 -> lane 0
+    assert_eq!(simd.as_array_mask64x2(compact), [-1, 0]);
+    assert_eq!(simd.to_bitmask_mask64x2(compact), 0b01);
+    simd.set_mask64x2(&mut compact, 1, true);
+    assert_eq!(simd.to_bitmask_mask64x2(compact), 0b11);
+
+    assert!(simd.any_true_mask64x2(x));
+    assert!(!simd.all_true_mask64x2(x));
+    assert!(simd.any_false_mask64x2(x));
+    assert!(!simd.all_false_mask64x2(x));
+    assert!(simd.all_true_mask64x2(t));
+    assert!(simd.all_false_mask64x2(f));
+}
+
+#[simd_test]
+fn mask64x4_ops<S: Simd>(simd: S) {
+    let t = simd.splat_mask64x4(true);
+    let f = simd.splat_mask64x4(false);
+    assert_eq!(simd.as_array_mask64x4(t), [-1, -1, -1, -1]); // true lanes read back as -1
+    assert_eq!(simd.as_array_mask64x4(f), [0, 0, 0, 0]); // false lanes read back as 0
+
+    let x = simd.load_array_mask64x4([-1, 0, -1, 0]);
+    let y = simd.load_array_mask64x4([0, -1, -1, 0]);
+    assert_eq!(
+        simd.as_array_mask64x4(simd.and_mask64x4(x, y)),
+        [0, 0, -1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x4(simd.or_mask64x4(x, y)),
+        [-1, -1, -1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x4(simd.xor_mask64x4(x, y)),
+        [-1, -1, 0, 0]
+    );
+    assert_eq!(simd.as_array_mask64x4(simd.not_mask64x4(x)), [0, -1, 0, -1]);
+    assert_eq!(
+        simd.as_array_mask64x4(simd.select_mask64x4(x, t, f)),
+        [-1, 0, -1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x4(simd.simd_eq_mask64x4(x, y)),
+        [0, 0, -1, -1]
+    );
+
+    let mut compact = simd.from_bitmask_mask64x4(0b1010); // bit i -> lane i
+    assert_eq!(simd.as_array_mask64x4(compact), [0, -1, 0, -1]);
+    assert_eq!(simd.to_bitmask_mask64x4(compact), 0b1010);
+    simd.set_mask64x4(&mut compact, 0, true);
+    assert_eq!(simd.to_bitmask_mask64x4(compact), 0b1011);
+
+    let stitched = simd.combine_mask64x2(
+        simd.load_array_mask64x2([-1, 0]),
+        simd.load_array_mask64x2([0, -1]),
+    );
+    assert_eq!(simd.as_array_mask64x4(stitched), [-1, 0, 0, -1]);
+    let (lo, hi) = simd.split_mask64x4(stitched);
+    assert_eq!(simd.as_array_mask64x2(lo), [-1, 0]);
+    assert_eq!(simd.as_array_mask64x2(hi), [0, -1]);
+
+    assert!(simd.any_true_mask64x4(x));
+    assert!(!simd.all_true_mask64x4(x));
+    assert!(simd.any_false_mask64x4(x));
+    assert!(!simd.all_false_mask64x4(x));
+    assert!(simd.all_true_mask64x4(t));
+    assert!(simd.all_false_mask64x4(f));
+}
+
+#[simd_test]
+fn mask64x8_ops<S: Simd>(simd: S) {
+    let t = simd.splat_mask64x8(true);
+    let f = simd.splat_mask64x8(false);
+    assert_eq!(simd.as_array_mask64x8(t), [-1, -1, -1, -1, -1, -1, -1, -1]);
+    assert_eq!(simd.as_array_mask64x8(f), [0, 0, 0, 0, 0, 0, 0, 0]);
+
+    let x = simd.load_array_mask64x8([-1, 0, -1, 0, -1, 0, -1, 0]);
+    let y = simd.load_array_mask64x8([0, -1, -1, 0, 0, -1, -1, 0]);
+    assert_eq!(
+        simd.as_array_mask64x8(simd.and_mask64x8(x, y)),
+        [0, 0, -1, 0, 0, 0, -1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x8(simd.or_mask64x8(x, y)),
+        [-1, -1, -1, 0, -1, -1, -1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x8(simd.xor_mask64x8(x, y)),
+        [-1, -1, 0, 0, -1, -1, 0, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x8(simd.not_mask64x8(x)),
+        [0, -1, 0, -1, 0, -1, 0, -1]
+    );
+    assert_eq!(
+        simd.as_array_mask64x8(simd.select_mask64x8(x, t, f)),
+        [-1, 0, -1, 0, -1, 0, -1, 0]
+    );
+    assert_eq!(
+        simd.as_array_mask64x8(simd.simd_eq_mask64x8(x, y)),
+        [0, 0, -1, -1, 0, 0, -1, -1]
+    );
+
+    let mut compact = simd.from_bitmask_mask64x8(0b1010_0101); // bit i -> lane i
+    assert_eq!(
+        simd.as_array_mask64x8(compact),
+        [-1, 0, -1, 0, 0, -1, 0, -1]
+    );
+    assert_eq!(simd.to_bitmask_mask64x8(compact), 0b1010_0101);
+    simd.set_mask64x8(&mut compact, 1, true);
+    assert_eq!(simd.to_bitmask_mask64x8(compact), 0b1010_0111);
+
+    let stitched = simd.combine_mask64x4(
+        simd.load_array_mask64x4([-1, 0, -1, 0]),
+        simd.load_array_mask64x4([0, -1, 0, -1]),
+    );
+    assert_eq!(
+        simd.as_array_mask64x8(stitched),
+        [-1, 0, -1, 0, 0, -1, 0, -1]
+    );
+    let (lo, hi) = simd.split_mask64x8(stitched);
+    assert_eq!(simd.as_array_mask64x4(lo), [-1, 0, -1, 0]);
+    assert_eq!(simd.as_array_mask64x4(hi), [0, -1, 0, -1]);
+
+    assert!(simd.any_true_mask64x8(x));
+    assert!(!simd.all_true_mask64x8(x));
+    assert!(simd.any_false_mask64x8(x));
+    assert!(!simd.all_false_mask64x8(x));
+    assert!(simd.all_true_mask64x8(t));
+    assert!(simd.all_false_mask64x8(f));
+}
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index 57236cc26..ce22f42ce 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -11,6 +11,7 @@
 
 //! Tests for `fearless_simd`.
 
+mod int64;
 mod lm_generated;
 
 use fearless_simd::*;
diff --git a/fearless_simd_tests/tests/harness/slide_exhaustive.rs b/fearless_simd_tests/tests/harness/slide_exhaustive.rs
index f41752646..78e30d7e2 100644
--- a/fearless_simd_tests/tests/harness/slide_exhaustive.rs
+++ b/fearless_simd_tests/tests/harness/slide_exhaustive.rs
@@ -225,6 +225,8 @@ macro_rules! test_slide_exhaustive {
 // 128-bit vectors (block size == vector size, so within_blocks uses same range as vector-wide)
 test_slide_exhaustive!(slide_exhaustive_f32x4, f32x4, f32, 4, vec4, block4);
 test_slide_exhaustive!(slide_exhaustive_f64x2, f64x2, f64, 2, vec2, block2);
+test_slide_exhaustive!(slide_exhaustive_i64x2, i64x2, i64, 2, vec2, block2);
+test_slide_exhaustive!(slide_exhaustive_u64x2, u64x2, u64, 2, vec2, block2);
 test_slide_exhaustive!(slide_exhaustive_i8x16, i8x16, i8, 16, vec16, block16);
 test_slide_exhaustive!(slide_exhaustive_u8x16, u8x16, u8, 16, vec16, block16);
 test_slide_exhaustive!(slide_exhaustive_i16x8, i16x8, i16, 8, vec8, block8);
@@ -235,6 +237,8 @@ test_slide_exhaustive!(slide_exhaustive_u32x4, u32x4, u32, 4, vec4, block4);
 // 256-bit vectors (block size = 128 bits = half the vector size)
 test_slide_exhaustive!(slide_exhaustive_f32x8, f32x8, f32, 8, vec8, block4);
 test_slide_exhaustive!(slide_exhaustive_f64x4, f64x4, f64, 4, vec4, block2);
+test_slide_exhaustive!(slide_exhaustive_i64x4, i64x4, i64, 4, vec4, block2);
+test_slide_exhaustive!(slide_exhaustive_u64x4, u64x4, u64, 4, vec4, block2);
 test_slide_exhaustive!(slide_exhaustive_i8x32, i8x32, i8, 32, vec32, block16);
 test_slide_exhaustive!(slide_exhaustive_u8x32, u8x32, u8, 32, vec32, block16);
 test_slide_exhaustive!(slide_exhaustive_i16x16, i16x16, i16, 16, vec16, block8);
@@ -245,6 +249,8 @@ test_slide_exhaustive!(slide_exhaustive_u32x8, u32x8, u32, 8, vec8, block4);
 // 512-bit vectors (block size = 128 bits = quarter the vector size)
 test_slide_exhaustive!(slide_exhaustive_f32x16, f32x16, f32, 16, vec16, block4);
 test_slide_exhaustive!(slide_exhaustive_f64x8, f64x8, f64, 8, vec8, block2);
+test_slide_exhaustive!(slide_exhaustive_i64x8, i64x8, i64, 8, vec8, block2);
+test_slide_exhaustive!(slide_exhaustive_u64x8, u64x8, u64, 8, vec8, block2);
 test_slide_exhaustive!(slide_exhaustive_i8x64, i8x64, i8, 64, vec64, block16);
 test_slide_exhaustive!(slide_exhaustive_u8x64, u8x64, u8, 64, vec64, block16);
 test_slide_exhaustive!(slide_exhaustive_i16x32, i16x32, i16, 32, vec32, block8);

From 63fd005a3b6296d1386fd0355fe69064fe98f6a4 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 14:08:18 +0100
Subject: [PATCH 50/55] Placate Clippy

---
 fearless_simd/src/generated/fallback.rs | 60 ++++++++++++-------------
 fearless_simd_gen/src/generic.rs        |  7 ++-
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
index f1877087d..c13e05334 100644
--- a/fearless_simd/src/generated/fallback.rs
+++ b/fearless_simd/src/generated/fallback.rs
@@ -1820,22 +1820,22 @@ impl Simd for Fallback {
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
         let lanes: [i8; 16usize] = [
-            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 8usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 9usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 10usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 11usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 12usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 13usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 14usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 15usize) & 1) != 0 { !0 } else { 0 },
+            if bits & 1 != 0 { !0 } else { 0 },
+            if (bits >> 1usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 2usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 3usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 4usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 5usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 6usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 7usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 8usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 9usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 10usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 11usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 12usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 13usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 14usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 15usize) & 1 != 0 { !0 } else { 0 },
         ];
         lanes.simd_into(self)
     }
@@ -3004,14 +3004,14 @@ impl Simd for Fallback {
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
         let lanes: [i16; 8usize] = [
-            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 4usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 5usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 6usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 7usize) & 1) != 0 { !0 } else { 0 },
+            if bits & 1 != 0 { !0 } else { 0 },
+            if (bits >> 1usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 2usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 3usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 4usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 5usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 6usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 7usize) & 1 != 0 { !0 } else { 0 },
         ];
         lanes.simd_into(self)
     }
@@ -3872,10 +3872,10 @@ impl Simd for Fallback {
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
         let lanes: [i32; 4usize] = [
-            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 2usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 3usize) & 1) != 0 { !0 } else { 0 },
+            if bits & 1 != 0 { !0 } else { 0 },
+            if (bits >> 1usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 2usize) & 1 != 0 { !0 } else { 0 },
+            if (bits >> 3usize) & 1 != 0 { !0 } else { 0 },
         ];
         lanes.simd_into(self)
     }
@@ -4804,8 +4804,8 @@ impl Simd for Fallback {
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         let lanes: [i64; 2usize] = [
-            if ((bits >> 0usize) & 1) != 0 { !0 } else { 0 },
-            if ((bits >> 1usize) & 1) != 0 { !0 } else { 0 },
+            if bits & 1 != 0 { !0 } else { 0 },
+            if (bits >> 1usize) & 1 != 0 { !0 } else { 0 },
         ];
         lanes.simd_into(self)
     }
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index f75061b8b..7a37098c9 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -542,7 +542,12 @@ pub(crate) fn generic_mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecTyp
     let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits);
     let len = vec_ty.len;
     let lanes = unrolled_array(len, |idx| {
-        quote! { if ((bits >> #idx) & 1) != 0 { !0 } else { 0 } }
+        let bit = if idx == 0 {
+            quote! { bits & 1 }
+        } else {
+            quote! { (bits >> #idx) & 1 }
+        };
+        quote! { if #bit != 0 { !0 } else { 0 } }
     });
 
     quote! {

From 47312fa458447b191714b68df9956a54cdda83c7 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 14:11:55 +0100
Subject: [PATCH 51/55] Placate Clippy some more

---
 fearless_simd_gen/src/generic.rs | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index 7a37098c9..a7c6a37cb 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -312,11 +312,8 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
     }
 }
 
-pub(crate) fn unrolled_array(
-    len: usize,
-    mut item: impl FnMut(usize) -> TokenStream,
-) -> TokenStream {
-    let items = (0..len).map(|idx| item(idx)).collect::<Vec<_>>();
+pub(crate) fn unrolled_array(len: usize, item: impl FnMut(usize) -> TokenStream) -> TokenStream {
+    let items = (0..len).map(item).collect::<Vec<_>>();
     quote! { [#(#items),*] }
 }
 

From 93c1cc3f8092f16cd8a31998410513275720394c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 14:16:51 +0100
Subject: [PATCH 52/55] Placate Clippy in tests

---
 fearless_simd_tests/tests/harness/int64.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/int64.rs b/fearless_simd_tests/tests/harness/int64.rs
index 59457eaf5..a5c7884ad 100644
--- a/fearless_simd_tests/tests/harness/int64.rs
+++ b/fearless_simd_tests/tests/harness/int64.rs
@@ -900,8 +900,12 @@ fn native_width_i64_u64<S: Simd>(simd: S) {
         vec![21; S::u64s::N]
     );
 
-    let i_true: Vec<i64> = (0..S::i64s::N).map(|i| -(i as i64) - 1).collect();
-    let i_false: Vec<i64> = (0..S::i64s::N).map(|i| i as i64 + 1).collect();
+    let i_true: Vec<i64> = (0..S::i64s::N)
+        .map(|i| -i64::try_from(i).expect("native vector length fits in i64") - 1)
+        .collect();
+    let i_false: Vec<i64> = (0..S::i64s::N)
+        .map(|i| i64::try_from(i).expect("native vector length fits in i64") + 1)
+        .collect();
     let i_selected = mask.select(
         S::i64s::from_slice(simd, &i_true),
         S::i64s::from_slice(simd, &i_false),

From 172f2b7922dadd9df7a1d107d0ab425a45186e22 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 14:31:53 +0100
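Subject: [PATCH 53/55] Align u64 load/store interleaved with vld4/vst4
 semantics

`load_interleaved_128_u64x8` now deinterleaves memory laid out as
`[a0, b0, c0, d0, a1, b1, c1, d1]` into `[a0, a1, b0, b1, c0, c1, d0, d1]`,
and `store_interleaved_128_u64x8` performs the inverse. Previously the
two permutations were the other way around.

The fallback, SSE4.2, AVX2 and AVX-512 implementations are updated to
match, and the `load_interleaved_128` / `store_interleaved_128` trait
docs now describe the new layout.
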
---
 fearless_simd/src/generated/avx2.rs        |  64 +++-------
 fearless_simd/src/generated/avx512.rs      |   4 +-
 fearless_simd/src/generated/fallback.rs    |   8 +-
 fearless_simd/src/generated/simd_trait.rs  |  20 ++--
 fearless_simd/src/generated/sse4_2.rs      |  64 +++-------
 fearless_simd_gen/src/mk_fallback.rs       |  16 +--
 fearless_simd_gen/src/mk_x86.rs            | 130 ++++-----------------
 fearless_simd_gen/src/ops.rs               |  14 +--
 fearless_simd_tests/tests/harness/int64.rs |   4 +-
 9 files changed, 83 insertions(+), 241 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index e9db0d6c3..67dec71f4 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -14135,25 +14135,17 @@ impl Simd for Avx2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx2, src: &[u64; 8usize]) -> u64x8<Avx2> {
-                let (chunks, []) = src.as_chunks::<2usize>() else {
-                    unreachable!()
-                };
-                let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]);
-                let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]);
-                let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]);
-                let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]);
-                let out0 = _mm_unpacklo_epi64(v0, v1);
-                let out1 = _mm_unpacklo_epi64(v2, v3);
-                let out2 = _mm_unpackhi_epi64(v0, v1);
-                let out3 = _mm_unpackhi_epi64(v2, v3);
-                token.combine_u64x4(
-                    token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)),
-                )
+                [
+                    src[0usize],
+                    src[4usize],
+                    src[1usize],
+                    src[5usize],
+                    src[2usize],
+                    src[6usize],
+                    src[3usize],
+                    src[7usize],
+                ]
+                .simd_into(token)
             }
         );
         kernel(self, src)
@@ -14163,36 +14155,10 @@ impl Simd for Avx2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx2, a: u64x8<Avx2>, dest: &mut [u64; 8usize]) -> () {
-                let (v01, v23) = token.split_u64x8(a);
-                let (v0, v1) = token.split_u64x4(v01);
-                let (v2, v3) = token.split_u64x4(v23);
-                let v0 = v0.into();
-                let v1 = v1.into();
-                let v2 = v2.into();
-                let v3 = v3.into();
-                let out0 = _mm_unpacklo_epi64(v0, v2);
-                let out1 = _mm_unpackhi_epi64(v0, v2);
-                let out2 = _mm_unpacklo_epi64(v1, v3);
-                let out3 = _mm_unpackhi_epi64(v1, v3);
-                let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
-                    unreachable!()
-                };
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out0,
-                    &mut chunks[0],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out1,
-                    &mut chunks[1],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out2,
-                    &mut chunks[2],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out3,
-                    &mut chunks[3],
-                );
+                *dest = [
+                    a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize],
+                    a[7usize],
+                ];
             }
         );
         kernel(self, a, dest);
diff --git a/fearless_simd/src/generated/avx512.rs b/fearless_simd/src/generated/avx512.rs
index 1a6ff0288..77e3e5025 100644
--- a/fearless_simd/src/generated/avx512.rs
+++ b/fearless_simd/src/generated/avx512.rs
@@ -16121,7 +16121,7 @@ impl Simd for Avx512 {
             fn kernel(token: Avx512, src: &[u64; 8usize]) -> u64x8<Avx512> {
                 let lanes: __m512i =
                     crate::transmute::checked_transmute_copy::<[u64; 8usize], __m512i>(src);
-                _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), lanes)
+                _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7), lanes)
                     .simd_into(token)
             }
         );
@@ -16133,7 +16133,7 @@ impl Simd for Avx512 {
             #[inline(always)]
             fn kernel(token: Avx512, a: u64x8<Avx512>, dest: &mut [u64; 8usize]) -> () {
                 let lanes =
-                    _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7), a.into());
+                    _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), a.into());
                 crate::transmute::checked_transmute_store::<__m512i, [u64; 8usize]>(lanes, dest);
             }
         );
diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
index c13e05334..89393c87c 100644
--- a/fearless_simd/src/generated/fallback.rs
+++ b/fearless_simd/src/generated/fallback.rs
@@ -11404,12 +11404,12 @@ impl Simd for Fallback {
     fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self> {
         [
             src[0usize],
-            src[2usize],
             src[4usize],
-            src[6usize],
             src[1usize],
-            src[3usize],
             src[5usize],
+            src[2usize],
+            src[6usize],
+            src[3usize],
             src[7usize],
         ]
         .simd_into(self)
@@ -11417,7 +11417,7 @@ impl Simd for Fallback {
     #[inline(always)]
     fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> () {
         *dest = [
-            a[0usize], a[4usize], a[1usize], a[5usize], a[2usize], a[6usize], a[3usize], a[7usize],
+            a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize], a[7usize],
         ];
     }
     #[inline(always)]
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 08d5af348..43ee9123b 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -2312,9 +2312,9 @@ pub trait Simd:
     fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `i32` elements.\n\nThis is a bitwise reinterpretation only, and does not perform any conversions."]
     fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self>;
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self>;
@@ -2490,9 +2490,9 @@ pub trait Simd:
     fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self>;
@@ -2697,9 +2697,9 @@ pub trait Simd:
     fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> ();
     #[doc = "Truncate each element to a narrower integer type.\n\nThe number of elements in the result is twice that of the input."]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self>;
@@ -2910,9 +2910,9 @@ pub trait Simd:
     fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self>;
@@ -3213,9 +3213,9 @@ pub trait Simd:
     fn max_u64x8(self, a: u64x8<Self>, b: u64x8<Self>) -> u64x8<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u64x8(self, a: u64x8<Self>) -> (u64x4<Self>, u64x4<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn load_interleaved_128_u64x8(self, src: &[u64; 8usize]) -> u64x8<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit vectors into lane-interleaved memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn store_interleaved_128_u64x8(self, a: u64x8<Self>, dest: &mut [u64; 8usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u8_u64x8(self, a: u64x8<Self>) -> u8x64<Self>;
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 264c6990b..cfc2f7b40 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -11787,25 +11787,17 @@ impl Simd for Sse4_2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Sse4_2, src: &[u64; 8usize]) -> u64x8<Sse4_2> {
-                let (chunks, []) = src.as_chunks::<2usize>() else {
-                    unreachable!()
-                };
-                let v0: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]);
-                let v1: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]);
-                let v2: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]);
-                let v3: __m128i =
-                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]);
-                let out0 = _mm_unpacklo_epi64(v0, v1);
-                let out1 = _mm_unpacklo_epi64(v2, v3);
-                let out2 = _mm_unpackhi_epi64(v0, v1);
-                let out3 = _mm_unpackhi_epi64(v2, v3);
-                token.combine_u64x4(
-                    token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)),
-                    token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)),
-                )
+                [
+                    src[0usize],
+                    src[4usize],
+                    src[1usize],
+                    src[5usize],
+                    src[2usize],
+                    src[6usize],
+                    src[3usize],
+                    src[7usize],
+                ]
+                .simd_into(token)
             }
         );
         kernel(self, src)
@@ -11815,36 +11807,10 @@ impl Simd for Sse4_2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Sse4_2, a: u64x8<Sse4_2>, dest: &mut [u64; 8usize]) -> () {
-                let (v01, v23) = token.split_u64x8(a);
-                let (v0, v1) = token.split_u64x4(v01);
-                let (v2, v3) = token.split_u64x4(v23);
-                let v0 = v0.into();
-                let v1 = v1.into();
-                let v2 = v2.into();
-                let v3 = v3.into();
-                let out0 = _mm_unpacklo_epi64(v0, v2);
-                let out1 = _mm_unpackhi_epi64(v0, v2);
-                let out2 = _mm_unpacklo_epi64(v1, v3);
-                let out3 = _mm_unpackhi_epi64(v1, v3);
-                let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
-                    unreachable!()
-                };
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out0,
-                    &mut chunks[0],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out1,
-                    &mut chunks[1],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out2,
-                    &mut chunks[2],
-                );
-                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
-                    out3,
-                    &mut chunks[3],
-                );
+                *dest = [
+                    a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize],
+                    a[7usize],
+                ];
             }
         );
         kernel(self, a, dest);
diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs
index 810baa9e1..92099258a 100644
--- a/fearless_simd_gen/src/mk_fallback.rs
+++ b/fearless_simd_gen/src/mk_fallback.rs
@@ -472,12 +472,8 @@ impl Level for Fallback {
                 block_count,
             } => {
                 let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
-                let stride = if vec_ty.scalar_bits == 64 {
-                    len / block_count as usize
-                } else {
-                    block_count as usize
-                };
-                let items = interleave_indices(len, stride, |idx| quote! { src[#idx] });
+                let items =
+                    interleave_indices(len, block_count as usize, |idx| quote! { src[#idx] });
 
                 quote! {
                     #method_sig {
@@ -490,12 +486,8 @@ impl Level for Fallback {
                 block_count,
             } => {
                 let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
-                let stride = if vec_ty.scalar_bits == 64 {
-                    block_count as usize
-                } else {
-                    len / block_count as usize
-                };
-                let items = interleave_indices(len, stride, |idx| quote! { a[#idx] });
+                let items =
+                    interleave_indices(len, len / block_count as usize, |idx| quote! { a[#idx] });
 
                 quote! {
                     #method_sig {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 84f7a9a1d..b7e7cd799 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -10,6 +10,7 @@ use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
     generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
     integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, scalar_shift,
+    unrolled_array,
 };
 use crate::level::Level;
 use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret};
@@ -911,20 +912,6 @@ fn interleaved_store_indices(len: usize, block_count: usize) -> Vec<usize> {
         .collect()
 }
 
-fn interleaved_load_indices_64(len: usize, block_count: usize) -> Vec<usize> {
-    let stream_len = len / block_count;
-    (0..stream_len)
-        .flat_map(|i| (0..block_count).map(move |stream| stream * stream_len + i))
-        .collect()
-}
-
-fn interleaved_store_indices_64(len: usize, block_count: usize) -> Vec<usize> {
-    let stream_len = len / block_count;
-    (0..block_count)
-        .flat_map(|stream| (0..stream_len).map(move |i| i * block_count + stream))
-        .collect()
-}
-
 impl X86 {
     pub(crate) fn handle_splat(&self, op: Op, vec_ty: &VecType) -> TokenStream {
         if *self == Self::Avx512 && vec_ty.scalar == ScalarType::Mask {
@@ -2940,51 +2927,16 @@ impl X86 {
         }
         match vec_ty.scalar_bits {
             64 => {
-                let block_ty =
-                    VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
-                let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
-                let native_ty = self.arch_ty(&block_ty);
-                let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty);
-                let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty);
-                let vec_combined =
-                    VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2);
-                let combine_half = Ident::new(
-                    &format!("combine_{}", block_ty.rust_name()),
-                    Span::call_site(),
-                );
-                let combine_full = Ident::new(
-                    &format!("combine_{}", vec_combined.rust_name()),
-                    Span::call_site(),
-                );
-                let block_len = block_size as usize / vec_ty.scalar_bits;
+                let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
+                let indices = interleaved_load_indices(len, block_count as usize);
+                let items = unrolled_array(len, |idx| {
+                    let src_idx = indices[idx];
+                    quote! { src[#src_idx] }
+                });
 
                 self.kernel_method(op, vec_ty, |token| {
                     quote! {
-                        let (chunks, []) = src.as_chunks::<#block_len>() else {
-                            unreachable!()
-                        };
-                        let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                            &chunks[0],
-                        );
-                        let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                            &chunks[1],
-                        );
-                        let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                            &chunks[2],
-                        );
-                        let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                            &chunks[3],
-                        );
-
-                        let out0 = #unpacklo_64(v0, v1);
-                        let out1 = #unpacklo_64(v2, v3);
-                        let out2 = #unpackhi_64(v0, v1);
-                        let out3 = #unpackhi_64(v2, v3);
-
-                        #token.#combine_full(
-                            #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)),
-                            #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)),
-                        )
+                        #items.simd_into(#token)
                     }
                 })
             }
@@ -3131,12 +3083,10 @@ impl X86 {
         let native_ty = self.arch_ty(vec_ty);
         let len = vec_ty.len;
         let permute = avx512_permutexvar_intrinsic(vec_ty);
-        let indices = if vec_ty.scalar_bits == 64 {
-            interleaved_load_indices_64(vec_ty.len, block_count as usize)
-        } else {
-            interleaved_load_indices(vec_ty.len, block_count as usize)
-        };
-        let indices = avx512_index_vector(vec_ty, indices);
+        let indices = avx512_index_vector(
+            vec_ty,
+            interleaved_load_indices(vec_ty.len, block_count as usize),
+        );
 
         self.kernel_method(op, vec_ty, |token| {
             quote! {
@@ -3166,46 +3116,16 @@ impl X86 {
         }
         match vec_ty.scalar_bits {
             64 => {
-                let block_ty =
-                    VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
-                let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
-                let native_ty = self.arch_ty(&block_ty);
-                let unpacklo_64 = simple_sign_unaware_intrinsic("unpacklo", &block_ty);
-                let unpackhi_64 = simple_sign_unaware_intrinsic("unpackhi", &block_ty);
-
-                let vec_combined =
-                    VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2);
-                let split_half = Ident::new(
-                    &format!("split_{}", vec_combined.rust_name()),
-                    Span::call_site(),
-                );
-                let split_full =
-                    Ident::new(&format!("split_{}", vec_ty.rust_name()), Span::call_site());
-                let block_len = block_size as usize / vec_ty.scalar_bits;
+                let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
+                let indices = interleaved_store_indices(len, block_count as usize);
+                let items = unrolled_array(len, |idx| {
+                    let lane_idx = indices[idx];
+                    quote! { a[#lane_idx] }
+                });
 
-                self.kernel_method(op, vec_ty, |token| {
+                self.kernel_method(op, vec_ty, |_| {
                     quote! {
-                        let (v01, v23) = #token.#split_full(a);
-                        let (v0, v1) = #token.#split_half(v01);
-                        let (v2, v3) = #token.#split_half(v23);
-                        let v0 = v0.into();
-                        let v1 = v1.into();
-                        let v2 = v2.into();
-                        let v3 = v3.into();
-
-                        let out0 = #unpacklo_64(v0, v2);
-                        let out1 = #unpackhi_64(v0, v2);
-                        let out2 = #unpacklo_64(v1, v3);
-                        let out3 = #unpackhi_64(v1, v3);
-
-                        let (chunks, []) = dest.as_chunks_mut::<#block_len>() else {
-                            unreachable!()
-                        };
-
-                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]);
-                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]);
-                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]);
-                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]);
+                        *dest = #items;
                     }
                 })
             }
@@ -3346,12 +3266,10 @@ impl X86 {
         let native_ty = self.arch_ty(vec_ty);
         let len = vec_ty.len;
         let permute = avx512_permutexvar_intrinsic(vec_ty);
-        let indices = if vec_ty.scalar_bits == 64 {
-            interleaved_store_indices_64(vec_ty.len, block_count as usize)
-        } else {
-            interleaved_store_indices(vec_ty.len, block_count as usize)
-        };
-        let indices = avx512_index_vector(vec_ty, indices);
+        let indices = avx512_index_vector(
+            vec_ty,
+            interleaved_store_indices(vec_ty.len, block_count as usize),
+        );
 
         self.kernel_method(op, vec_ty, |_| {
             quote! {
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index 17495b41b..9dcb1bcfe 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -1367,11 +1367,11 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
             },
             "Load elements from an array with 4-way interleaving.\n\n\
             This is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded \
-            vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks \
+            vectors, while this operation treats memory as four interleaved 128-bit vectors and deinterleaves them \
             into one vector.\n\n\
             For example, with 32-bit lanes, memory laid out as \
-            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as \
-            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.",
+            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` loads as \
+            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.",
         ));
     }
 
@@ -1385,11 +1385,11 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
             },
             "Store elements to an array with 4-way interleaving.\n\n\
             This is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: \
-            `interleave` combines two already-loaded vectors, while this operation transposes one vector into four \
-            consecutive 128-bit blocks in memory.\n\n\
+            `interleave` combines two already-loaded vectors, while this operation stores four consecutive 128-bit \
+            vectors into lane-interleaved memory.\n\n\
             For example, with 32-bit lanes, a vector containing \
-            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as \
-            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.",
+            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` stores as \
+            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.",
         ));
     }
 
diff --git a/fearless_simd_tests/tests/harness/int64.rs b/fearless_simd_tests/tests/harness/int64.rs
index a5c7884ad..10a851604 100644
--- a/fearless_simd_tests/tests/harness/int64.rs
+++ b/fearless_simd_tests/tests/harness/int64.rs
@@ -1250,9 +1250,9 @@ fn interleave_deinterleave_u64x8<S: Simd>(simd: S) {
 fn load_store_interleaved_128_u64x8<S: Simd>(simd: S) {
     let data = [1, 2, 101, 102, 201, 202, 301, 302];
     let loaded = simd.load_interleaved_128_u64x8(&data);
-    assert_eq!(*loaded, [1, 101, 201, 301, 2, 102, 202, 302]);
+    assert_eq!(*loaded, [1, 201, 2, 202, 101, 301, 102, 302]);
 
-    let a = u64x8::from_slice(simd, &[1, 101, 201, 301, 2, 102, 202, 302]);
+    let a = u64x8::from_slice(simd, &[1, 201, 2, 202, 101, 301, 102, 302]);
     let mut dest = [0_u64; 8];
     simd.store_interleaved_128_u64x8(a, &mut dest);
     assert_eq!(dest, data);

From 7602a4208488e860a3a230d852df9c2f7608f848 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 14:54:38 +0100
Subject: [PATCH 54/55] Emit optimized implementations for
 load/store_interleaved on sse4.2 and avx2

---
 fearless_simd/src/generated/avx2.rs   |  45 +++--
 fearless_simd/src/generated/sse4_2.rs |  64 +++++--
 fearless_simd_gen/src/mk_x86.rs       | 235 +++++++++++++++-----------
 3 files changed, 219 insertions(+), 125 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 67dec71f4..d89d9f0f2 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -14135,17 +14135,18 @@ impl Simd for Avx2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx2, src: &[u64; 8usize]) -> u64x8<Avx2> {
-                [
-                    src[0usize],
-                    src[4usize],
-                    src[1usize],
-                    src[5usize],
-                    src[2usize],
-                    src[6usize],
-                    src[3usize],
-                    src[7usize],
-                ]
-                .simd_into(token)
+                let (chunks, []) = src.as_chunks::<4>() else {
+                    unreachable!()
+                };
+                let v0: __m256i =
+                    crate::transmute::checked_transmute_copy::<[u64; 4], __m256i>(&chunks[0]);
+                let v1: __m256i =
+                    crate::transmute::checked_transmute_copy::<[u64; 4], __m256i>(&chunks[1]);
+                let lo = _mm256_unpacklo_epi64(v0, v1);
+                let hi = _mm256_unpackhi_epi64(v0, v1);
+                let out0 = _mm256_permute2x128_si256::<0x20>(lo, hi);
+                let out1 = _mm256_permute2x128_si256::<0x31>(lo, hi);
+                token.combine_u64x4(out0.simd_into(token), out1.simd_into(token))
             }
         );
         kernel(self, src)
@@ -14155,10 +14156,24 @@ impl Simd for Avx2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Avx2, a: u64x8<Avx2>, dest: &mut [u64; 8usize]) -> () {
-                *dest = [
-                    a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize],
-                    a[7usize],
-                ];
+                let (v0, v1) = token.split_u64x8(a);
+                let v0: __m256i = v0.into();
+                let v1: __m256i = v1.into();
+                let lo = _mm256_permute2x128_si256::<0x20>(v0, v1);
+                let hi = _mm256_permute2x128_si256::<0x31>(v0, v1);
+                let out0 = _mm256_unpacklo_epi64(lo, hi);
+                let out1 = _mm256_unpackhi_epi64(lo, hi);
+                let (chunks, []) = dest.as_chunks_mut::<4>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m256i, [u64; 4]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m256i, [u64; 4]>(
+                    out1,
+                    &mut chunks[1],
+                );
             }
         );
         kernel(self, a, dest);
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index cfc2f7b40..d61ce565a 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -11787,17 +11787,25 @@ impl Simd for Sse4_2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Sse4_2, src: &[u64; 8usize]) -> u64x8<Sse4_2> {
-                [
-                    src[0usize],
-                    src[4usize],
-                    src[1usize],
-                    src[5usize],
-                    src[2usize],
-                    src[6usize],
-                    src[3usize],
-                    src[7usize],
-                ]
-                .simd_into(token)
+                let (chunks, []) = src.as_chunks::<2usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u64; 2usize], __m128i>(&chunks[3]);
+                let out0 = _mm_unpacklo_epi64(v0, v2);
+                let out1 = _mm_unpackhi_epi64(v0, v2);
+                let out2 = _mm_unpacklo_epi64(v1, v3);
+                let out3 = _mm_unpackhi_epi64(v1, v3);
+                token.combine_u64x4(
+                    token.combine_u64x2(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u64x2(out2.simd_into(token), out3.simd_into(token)),
+                )
             }
         );
         kernel(self, src)
@@ -11807,10 +11815,36 @@ impl Simd for Sse4_2 {
         crate::kernel!(
             #[inline(always)]
             fn kernel(token: Sse4_2, a: u64x8<Sse4_2>, dest: &mut [u64; 8usize]) -> () {
-                *dest = [
-                    a[0usize], a[2usize], a[4usize], a[6usize], a[1usize], a[3usize], a[5usize],
-                    a[7usize],
-                ];
+                let (v01, v23) = token.split_u64x8(a);
+                let (v0, v1) = token.split_u64x4(v01);
+                let (v2, v3) = token.split_u64x4(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let out0 = _mm_unpacklo_epi64(v0, v1);
+                let out1 = _mm_unpacklo_epi64(v2, v3);
+                let out2 = _mm_unpackhi_epi64(v0, v1);
+                let out3 = _mm_unpackhi_epi64(v2, v3);
+                let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u64; 2usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
             }
         );
         kernel(self, a, dest);
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index b7e7cd799..9896ba5bb 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -10,7 +10,6 @@ use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
     generic_from_bytes, generic_mask_set, generic_op_name, generic_store_array, generic_to_bytes,
     integer_lane_mask_splat_arg, scalar_binary, scalar_binary_method, scalar_compare, scalar_shift,
-    unrolled_array,
 };
 use crate::level::Level;
 use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret};
@@ -2926,23 +2925,14 @@ impl X86 {
             return self.handle_avx512_load_interleaved(op, vec_ty, block_size, block_count);
         }
         match vec_ty.scalar_bits {
-            64 => {
-                let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
-                let indices = interleaved_load_indices(len, block_count as usize);
-                let items = unrolled_array(len, |idx| {
-                    let src_idx = indices[idx];
-                    quote! { src[#src_idx] }
-                });
-
-                self.kernel_method(op, vec_ty, |token| {
-                    quote! {
-                        #items.simd_into(#token)
-                    }
-                })
-            }
-            32 | 16 | 8 => {
-                let block_ty =
-                    VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
+            64 | 32 | 16 | 8 => {
+                let avx2_u64 = *self == Self::Avx2 && vec_ty.scalar_bits == 64;
+                let block_len = if avx2_u64 {
+                    4
+                } else {
+                    block_size as usize / vec_ty.scalar_bits
+                };
+                let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, block_len);
                 let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
                 let native_ty = self.arch_ty(&block_ty);
                 let vec_32 = block_ty.reinterpret(block_ty.scalar, 32);
@@ -2962,7 +2952,24 @@ impl X86 {
                     &format!("combine_{}", vec_combined.rust_name()),
                     Span::call_site(),
                 );
-                let block_len = block_size as usize / vec_ty.scalar_bits;
+                if avx2_u64 {
+                    return self.kernel_method(op, vec_ty, |token| {
+                        quote! {
+                            let (chunks, []) = src.as_chunks::<4>() else {
+                                unreachable!()
+                            };
+                            let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; 4], #native_ty>(&chunks[0]);
+                            let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; 4], #native_ty>(&chunks[1]);
+
+                            let lo = #unpacklo_64(v0, v1); // [0,4,2,6]
+                            let hi = #unpackhi_64(v0, v1); // [1,5,3,7]
+                            let out0 = _mm256_permute2x128_si256::<0x20>(lo, hi); // [0,4,1,5]
+                            let out1 = _mm256_permute2x128_si256::<0x31>(lo, hi); // [2,6,3,7]
+
+                            #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token))
+                        }
+                    });
+                }
 
                 let init_shuffle = match vec_ty.scalar_bits {
                     16 => Some(quote! {
@@ -2992,36 +2999,53 @@ impl X86 {
                     _ => None,
                 };
 
-                let final_unpack = if vec_ty.scalar == ScalarType::Float && vec_ty.scalar_bits == 32
-                {
-                    let cast_32 = cast_ident(
-                        ScalarType::Float,
-                        ScalarType::Float,
-                        64,
-                        32,
-                        block_ty.n_bits(),
-                    );
-                    let cast_64 = cast_ident(
-                        ScalarType::Float,
-                        ScalarType::Float,
-                        32,
-                        64,
-                        block_ty.n_bits(),
-                    );
+                let initial_unpack = if vec_ty.scalar_bits == 64 {
+                    None
+                } else {
+                    Some(quote! {
+                        let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
+                        let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
+                        let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
+                        let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15]
+                    })
+                };
 
-                    quote! {
-                        let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12]
-                        let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13]
-                        let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14]
-                        let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15]
+                let final_unpack = match (vec_ty.scalar, vec_ty.scalar_bits) {
+                    (_, 64) => quote! {
+                        let out0 = #unpacklo_64(v0, v2); // [0,4]
+                        let out1 = #unpackhi_64(v0, v2); // [1,5]
+                        let out2 = #unpacklo_64(v1, v3); // [2,6]
+                        let out3 = #unpackhi_64(v1, v3); // [3,7]
+                    },
+                    (ScalarType::Float, 32) => {
+                        let cast_32 = cast_ident(
+                            ScalarType::Float,
+                            ScalarType::Float,
+                            64,
+                            32,
+                            block_ty.n_bits(),
+                        );
+                        let cast_64 = cast_ident(
+                            ScalarType::Float,
+                            ScalarType::Float,
+                            32,
+                            64,
+                            block_ty.n_bits(),
+                        );
+
+                        quote! {
+                            let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12]
+                            let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13]
+                            let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14]
+                            let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15]
+                        }
                     }
-                } else {
-                    quote! {
+                    _ => quote! {
                         let out0 = #unpacklo_64(tmp0, tmp2); // [0,4,8,12]
                         let out1 = #unpackhi_64(tmp0, tmp2); // [1,5,9,13]
                         let out2 = #unpacklo_64(tmp1, tmp3); // [2,6,10,14]
                         let out3 = #unpackhi_64(tmp1, tmp3); // [3,7,11,15]
-                    }
+                    },
                 };
 
                 self.kernel_method(op, vec_ty, |token| {
@@ -3044,11 +3068,7 @@ impl X86 {
 
                         #init_shuffle
 
-                        let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
-                        let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
-                        let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
-                        let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15]
-
+                        #initial_unpack
                         #final_unpack
 
                         #token.#combine_full(
@@ -3115,23 +3135,14 @@ impl X86 {
             return self.handle_avx512_store_interleaved(op, vec_ty, block_size, block_count);
         }
         match vec_ty.scalar_bits {
-            64 => {
-                let len = (block_size * block_count) as usize / vec_ty.scalar_bits;
-                let indices = interleaved_store_indices(len, block_count as usize);
-                let items = unrolled_array(len, |idx| {
-                    let lane_idx = indices[idx];
-                    quote! { a[#lane_idx] }
-                });
-
-                self.kernel_method(op, vec_ty, |_| {
-                    quote! {
-                        *dest = #items;
-                    }
-                })
-            }
-            32 | 16 | 8 => {
-                let block_ty =
-                    VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
+            64 | 32 | 16 | 8 => {
+                let avx2_u64 = *self == Self::Avx2 && vec_ty.scalar_bits == 64;
+                let block_len = if avx2_u64 {
+                    4
+                } else {
+                    block_size as usize / vec_ty.scalar_bits
+                };
+                let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, block_len);
                 let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
                 let native_ty = self.arch_ty(&block_ty);
                 let vec_32 = block_ty.reinterpret(block_ty.scalar, 32);
@@ -3149,7 +3160,28 @@ impl X86 {
                 );
                 let split_full =
                     Ident::new(&format!("split_{}", vec_ty.rust_name()), Span::call_site());
-                let block_len = block_size as usize / vec_ty.scalar_bits;
+
+                if avx2_u64 {
+                    return self.kernel_method(op, vec_ty, |token| {
+                        quote! {
+                            let (v0, v1) = #token.#split_full(a);
+                            let v0: #native_ty = v0.into();
+                            let v1: #native_ty = v1.into();
+
+                            let lo = _mm256_permute2x128_si256::<0x20>(v0, v1); // [0,4,2,6]
+                            let hi = _mm256_permute2x128_si256::<0x31>(v0, v1); // [1,5,3,7]
+                            let out0 = #unpacklo_64(lo, hi); // [0,1,2,3]
+                            let out1 = #unpackhi_64(lo, hi); // [4,5,6,7]
+
+                            let (chunks, []) = dest.as_chunks_mut::<4>() else {
+                                unreachable!()
+                            };
+
+                            crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; 4]>(out0, &mut chunks[0]);
+                            crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; 4]>(out1, &mut chunks[1]);
+                        }
+                    });
+                }
 
                 let post_shuffle = match vec_ty.scalar_bits {
                     16 => Some(quote! {
@@ -3179,36 +3211,53 @@ impl X86 {
                     _ => None,
                 };
 
-                let final_unpack = if vec_ty.scalar == ScalarType::Float && vec_ty.scalar_bits == 32
-                {
-                    let cast_32 = cast_ident(
-                        ScalarType::Float,
-                        ScalarType::Float,
-                        64,
-                        32,
-                        block_ty.n_bits(),
-                    );
-                    let cast_64 = cast_ident(
-                        ScalarType::Float,
-                        ScalarType::Float,
-                        32,
-                        64,
-                        block_ty.n_bits(),
-                    );
+                let initial_unpack = if vec_ty.scalar_bits == 64 {
+                    None
+                } else {
+                    Some(quote! {
+                        let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
+                        let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
+                        let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
+                        let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15]
+                    })
+                };
 
-                    quote! {
-                        let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12]
-                        let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13]
-                        let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14]
-                        let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15]
+                let final_unpack = match (vec_ty.scalar, vec_ty.scalar_bits) {
+                    (_, 64) => quote! {
+                        let out0 = #unpacklo_64(v0, v1); // [0,1]
+                        let out1 = #unpacklo_64(v2, v3); // [2,3]
+                        let out2 = #unpackhi_64(v0, v1); // [4,5]
+                        let out3 = #unpackhi_64(v2, v3); // [6,7]
+                    },
+                    (ScalarType::Float, 32) => {
+                        let cast_32 = cast_ident(
+                            ScalarType::Float,
+                            ScalarType::Float,
+                            64,
+                            32,
+                            block_ty.n_bits(),
+                        );
+                        let cast_64 = cast_ident(
+                            ScalarType::Float,
+                            ScalarType::Float,
+                            32,
+                            64,
+                            block_ty.n_bits(),
+                        );
+
+                        quote! {
+                            let out0 = #cast_32(#unpacklo_64(#cast_64(tmp0), #cast_64(tmp2))); // [0,4,8,12]
+                            let out1 = #cast_32(#unpackhi_64(#cast_64(tmp0), #cast_64(tmp2))); // [1,5,9,13]
+                            let out2 = #cast_32(#unpacklo_64(#cast_64(tmp1), #cast_64(tmp3))); // [2,6,10,14]
+                            let out3 = #cast_32(#unpackhi_64(#cast_64(tmp1), #cast_64(tmp3))); // [3,7,11,15]
+                        }
                     }
-                } else {
-                    quote! {
+                    _ => quote! {
                         let out0 = #unpacklo_64(tmp0, tmp2); // [0,4,8,12]
                         let out1 = #unpackhi_64(tmp0, tmp2); // [1,5,9,13]
                         let out2 = #unpacklo_64(tmp1, tmp3); // [2,6,10,14]
                         let out3 = #unpackhi_64(tmp1, tmp3); // [3,7,11,15]
-                    }
+                    },
                 };
 
                 self.kernel_method(op, vec_ty, |token| {
@@ -3221,11 +3270,7 @@ impl X86 {
                         let v2 = v2.into();
                         let v3 = v3.into();
 
-                        let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
-                        let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
-                        let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
-                        let tmp3 = #unpackhi_32(v2, v3); // [10,14,11,15]
-
+                        #initial_unpack
                         #final_unpack
 
                         #post_shuffle

From d5fae13902744d677c5777df16416590fc3c4295 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 23 Jun 2026 15:05:17 +0100
Subject: [PATCH 55/55] Realign WASM load/store_interleaved impls with
 vld4/vst4 semantics

---
 fearless_simd/src/generated/wasm.rs |  20 +++---
 fearless_simd_gen/src/mk_wasm.rs    | 104 ++++++++++++----------------
 2 files changed, 54 insertions(+), 70 deletions(-)

diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 2c66ee1e1..2cab703e4 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -9696,14 +9696,10 @@ impl Simd for WasmSimd128 {
         let v1: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[1]);
         let v2: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[2]);
         let v3: v128 = crate::transmute::checked_transmute_copy::<[u64; 2usize], v128>(&chunks[3]);
-        let v01_lower = u64x2_shuffle::<0, 2>(v0, v1);
-        let v23_lower = u64x2_shuffle::<0, 2>(v2, v3);
-        let v01_upper = u64x2_shuffle::<1, 3>(v0, v1);
-        let v23_upper = u64x2_shuffle::<1, 3>(v2, v3);
-        let out0 = u64x2_shuffle::<0, 1>(v01_lower, v23_lower);
-        let out1 = u64x2_shuffle::<2, 3>(v01_lower, v23_lower);
-        let out2 = u64x2_shuffle::<0, 1>(v01_upper, v23_upper);
-        let out3 = u64x2_shuffle::<2, 3>(v01_upper, v23_upper);
+        let out0 = u64x2_shuffle::<0, 2>(v0, v2);
+        let out1 = u64x2_shuffle::<1, 3>(v0, v2);
+        let out2 = u64x2_shuffle::<0, 2>(v1, v3);
+        let out3 = u64x2_shuffle::<1, 3>(v1, v3);
         let combined_lower = self.combine_u64x2(out0.simd_into(self), out1.simd_into(self));
         let combined_upper = self.combine_u64x2(out2.simd_into(self), out3.simd_into(self));
         self.combine_u64x4(combined_lower, combined_upper)
@@ -9717,10 +9713,10 @@ impl Simd for WasmSimd128 {
         let v1: v128 = v1_vec.into();
         let v2: v128 = v2_vec.into();
         let v3: v128 = v3_vec.into();
-        let out0 = u64x2_shuffle::<0, 2>(v0, v2);
-        let out1 = u64x2_shuffle::<1, 3>(v0, v2);
-        let out2 = u64x2_shuffle::<0, 2>(v1, v3);
-        let out3 = u64x2_shuffle::<1, 3>(v1, v3);
+        let out0 = u64x2_shuffle::<0, 2>(v0, v1);
+        let out1 = u64x2_shuffle::<0, 2>(v2, v3);
+        let out2 = u64x2_shuffle::<1, 3>(v0, v1);
+        let out3 = u64x2_shuffle::<1, 3>(v2, v3);
         let (chunks, []) = dest.as_chunks_mut::<2usize>() else {
             unreachable!()
         };
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 11efb293b..9a53d5b61 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -640,8 +640,8 @@ impl Level for WasmSimd128 {
                     64 => (
                         quote! { 0, 2 },
                         quote! { 1, 3 },
-                        quote! { 0, 1 },
-                        quote! { 2, 3 },
+                        quote! { 0, 2 },
+                        quote! { 1, 3 },
                         quote! { u64x2_shuffle },
                     ),
                     _ => panic!("unsupported scalar_bits"),
@@ -660,6 +660,31 @@ impl Level for WasmSimd128 {
                     self.#combine_method_2x(combined_lower, combined_upper)
                 };
 
+                let shuffle_code = if vec_ty.scalar_bits == 64 {
+                    quote! {
+                        let out0 = #shuffle_fn::<#i1>(v0, v2);
+                        let out1 = #shuffle_fn::<#i2>(v0, v2);
+                        let out2 = #shuffle_fn::<#i1>(v1, v3);
+                        let out3 = #shuffle_fn::<#i2>(v1, v3);
+                    }
+                } else {
+                    quote! {
+                        // InterleaveLowerLanes(v0, v1) and InterleaveLowerLanes(v2, v3)
+                        let v01_lower = #shuffle_fn::<#i1>(v0, v1);
+                        let v23_lower = #shuffle_fn::<#i1>(v2, v3);
+
+                        // InterleaveUpperLanes(v0, v1) and InterleaveUpperLanes(v2, v3)
+                        let v01_upper = #shuffle_fn::<#i2>(v0, v1);
+                        let v23_upper = #shuffle_fn::<#i2>(v2, v3);
+
+                        // Interleave lower and upper to get final result
+                        let out0 = #shuffle_fn::<#i3>(v01_lower, v23_lower);
+                        let out1 = #shuffle_fn::<#i4>(v01_lower, v23_lower);
+                        let out2 = #shuffle_fn::<#i3>(v01_upper, v23_upper);
+                        let out3 = #shuffle_fn::<#i4>(v01_upper, v23_upper);
+                    }
+                };
+
                 quote! {
                     #method_sig {
                             let (chunks, []) = src.as_chunks::<#elems_per_vec>() else {
@@ -678,20 +703,7 @@ impl Level for WasmSimd128 {
                                 &chunks[3],
                             );
 
-                            // InterleaveLowerLanes(v0, v2) and InterleaveLowerLanes(v1, v3)
-                            let v01_lower = #shuffle_fn::<#i1>(v0, v1);
-                            let v23_lower = #shuffle_fn::<#i1>(v2, v3);
-
-                            // InterleaveUpperLanes(v0, v2) and InterleaveUpperLanes(v1, v3)
-                            let v01_upper = #shuffle_fn::<#i2>(v0, v1);
-                            let v23_upper = #shuffle_fn::<#i2>(v2, v3);
-
-                            // Interleave lower and upper to get final result
-                            let out0 = #shuffle_fn::<#i3>(v01_lower, v23_lower);
-                            let out1 = #shuffle_fn::<#i4>(v01_lower, v23_lower);
-                            let out2 = #shuffle_fn::<#i3>(v01_upper, v23_upper);
-                            let out3 = #shuffle_fn::<#i4>(v01_upper, v23_upper);
-
+                            #shuffle_code
                             #combine_code
                     }
                 }
@@ -704,44 +716,6 @@ impl Level for WasmSimd128 {
                 let elems_per_vec = block_size as usize / vec_ty.scalar_bits;
                 let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
 
-                if vec_ty.scalar_bits == 64 {
-                    let block_ty = vec_ty.block_ty();
-                    let block_ty_2x =
-                        VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 2);
-                    let block_ty_4x =
-                        VecType::new(block_ty.scalar, block_ty.scalar_bits, block_ty.len * 4);
-
-                    let split_method = generic_op_name("split", &block_ty_2x);
-                    let split_method_2x = generic_op_name("split", &block_ty_4x);
-
-                    return quote! {
-                        #method_sig {
-                            let (lower, upper) = self.#split_method_2x(a);
-                            let (v0_vec, v1_vec) = self.#split_method(lower);
-                            let (v2_vec, v3_vec) = self.#split_method(upper);
-
-                            let v0: v128 = v0_vec.into();
-                            let v1: v128 = v1_vec.into();
-                            let v2: v128 = v2_vec.into();
-                            let v3: v128 = v3_vec.into();
-
-                            let out0 = u64x2_shuffle::<0, 2>(v0, v2);
-                            let out1 = u64x2_shuffle::<1, 3>(v0, v2);
-                            let out2 = u64x2_shuffle::<0, 2>(v1, v3);
-                            let out3 = u64x2_shuffle::<1, 3>(v1, v3);
-
-                            let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else {
-                                unreachable!()
-                            };
-
-                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out0, &mut chunks[0]);
-                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out1, &mut chunks[1]);
-                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out2, &mut chunks[2]);
-                            crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out3, &mut chunks[3]);
-                        }
-                    };
-                }
-
                 let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits {
                     8 => (
                         quote! { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 },
@@ -758,6 +732,7 @@ impl Level for WasmSimd128 {
                         quote! { 2, 6, 3, 7 },
                         quote! { u32x4_shuffle },
                     ),
+                    64 => (quote! { 0, 2 }, quote! { 1, 3 }, quote! { u64x2_shuffle }),
                     _ => panic!("unsupported scalar_bits"),
                 };
 
@@ -781,10 +756,15 @@ impl Level for WasmSimd128 {
                     let v3: v128 = v3_vec.into();
                 };
 
-                quote! {
-                    #method_sig {
-                        #split_code
-
+                let shuffle_code = if vec_ty.scalar_bits == 64 {
+                    quote! {
+                        let out0 = #shuffle_fn::<#lower_indices>(v0, v1);
+                        let out1 = #shuffle_fn::<#lower_indices>(v2, v3);
+                        let out2 = #shuffle_fn::<#upper_indices>(v0, v1);
+                        let out3 = #shuffle_fn::<#upper_indices>(v2, v3);
+                    }
+                } else {
+                    quote! {
                         // InterleaveLowerLanes(v0, v2) and InterleaveLowerLanes(v1, v3)
                         let v02_lower = #shuffle_fn::<#lower_indices>(v0, v2);
                         let v13_lower = #shuffle_fn::<#lower_indices>(v1, v3);
@@ -798,6 +778,14 @@ impl Level for WasmSimd128 {
                         let out1 = #shuffle_fn::<#upper_indices>(v02_lower, v13_lower);
                         let out2 = #shuffle_fn::<#lower_indices>(v02_upper, v13_upper);
                         let out3 = #shuffle_fn::<#upper_indices>(v02_upper, v13_upper);
+                    }
+                };
+
+                quote! {
+                    #method_sig {
+                        #split_code
+
+                        #shuffle_code
 
                         let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else {
                             unreachable!()