From 2c81a777737ca607f199d04fd7509c5df15f9147 Mon Sep 17 00:00:00 2001
From: Cathal Mullan <contact@cathal.dev>
Date: Wed, 10 Jun 2026 19:39:10 +0100
Subject: [PATCH] Implement remaining aarch64 SHA-3 LLVM intrinsics

---
 example/neon.rs                | 235 +++++++++++++++++++++++++++++++++
 src/intrinsics/llvm_aarch64.rs |  82 ++++++++++++
 2 files changed, 317 insertions(+)

diff --git a/example/neon.rs b/example/neon.rs
index 6b024de7bb..ba63333daa 100644
--- a/example/neon.rs
+++ b/example/neon.rs
@@ -470,6 +470,220 @@ unsafe fn test_vsha512su1q_u64() {
     assert_eq!(r, e);
 }
 
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_s8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3s.v16i8; e = a ^ b ^ c per lane
+    let a = i8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b = i8x16::from([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]);
+    let c = i8x16::from([32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]);
+    let e = i8x16::from([48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]);
+    let r: i8x16 = unsafe { transmute(veor3q_s8(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_s16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3s.v8i16; e = a ^ b ^ c per lane
+    let a = i16x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
+    let b = i16x8::from([8, 9, 10, 11, 12, 13, 14, 15]);
+    let c = i16x8::from([16, 17, 18, 19, 20, 21, 22, 23]);
+    let e = i16x8::from([24, 25, 26, 27, 28, 29, 30, 31]);
+    let r: i16x8 = unsafe { transmute(veor3q_s16(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_s32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3s.v4i32; e = a ^ b ^ c per lane
+    let a = i32x4::from([0, 1, 2, 3]);
+    let b = i32x4::from([4, 5, 6, 7]);
+    let c = i32x4::from([8, 9, 10, 11]);
+    let e = i32x4::from([12, 13, 14, 15]);
+    let r: i32x4 = unsafe { transmute(veor3q_s32(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_s64() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3s.v2i64; e = a ^ b ^ c per lane
+    let a = i64x2::from([0, 1]);
+    let b = i64x2::from([2, 3]);
+    let c = i64x2::from([4, 5]);
+    let e = i64x2::from([6, 7]);
+    let r: i64x2 = unsafe { transmute(veor3q_s64(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_u8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3u.v16i8; e = a ^ b ^ c per lane
+    let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b = u8x16::from([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]);
+    let c = u8x16::from([32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]);
+    let e = u8x16::from([48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]);
+    let r: u8x16 = unsafe { transmute(veor3q_u8(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_u16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3u.v8i16; e = a ^ b ^ c per lane
+    let a = u16x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
+    let b = u16x8::from([8, 9, 10, 11, 12, 13, 14, 15]);
+    let c = u16x8::from([16, 17, 18, 19, 20, 21, 22, 23]);
+    let e = u16x8::from([24, 25, 26, 27, 28, 29, 30, 31]);
+    let r: u16x8 = unsafe { transmute(veor3q_u16(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_u32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3u.v4i32; e = a ^ b ^ c per lane
+    let a = u32x4::from([0, 1, 2, 3]);
+    let b = u32x4::from([4, 5, 6, 7]);
+    let c = u32x4::from([8, 9, 10, 11]);
+    let e = u32x4::from([12, 13, 14, 15]);
+    let r: u32x4 = unsafe { transmute(veor3q_u32(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_veor3q_u64() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.eor3u.v2i64; e = a ^ b ^ c per lane
+    let a = u64x2::from([0, 1]);
+    let b = u64x2::from([2, 3]);
+    let c = u64x2::from([4, 5]);
+    let e = u64x2::from([6, 7]);
+    let r: u64x2 = unsafe { transmute(veor3q_u64(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_s8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxs.v16i8; e = a ^ (b & !c) per lane
+    let a = i8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b = i8x16::from([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]);
+    let c = i8x16::from([32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]);
+    let e = i8x16::from([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]);
+    let r: i8x16 = unsafe { transmute(vbcaxq_s8(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_s16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxs.v8i16; e = a ^ (b & !c) per lane
+    let a = i16x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
+    let b = i16x8::from([8, 9, 10, 11, 12, 13, 14, 15]);
+    let c = i16x8::from([16, 17, 18, 19, 20, 21, 22, 23]);
+    let e = i16x8::from([8, 9, 10, 11, 12, 13, 14, 15]);
+    let r: i16x8 = unsafe { transmute(vbcaxq_s16(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_s32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxs.v4i32; e = a ^ (b & !c) per lane
+    let a = i32x4::from([0, 1, 2, 3]);
+    let b = i32x4::from([4, 5, 6, 7]);
+    let c = i32x4::from([8, 9, 10, 11]);
+    let e = i32x4::from([4, 5, 6, 7]);
+    let r: i32x4 = unsafe { transmute(vbcaxq_s32(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_s64() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxs.v2i64; e = a ^ (b & !c) per lane
+    let a = i64x2::from([0, 1]);
+    let b = i64x2::from([2, 3]);
+    let c = i64x2::from([4, 5]);
+    let e = i64x2::from([2, 3]);
+    let r: i64x2 = unsafe { transmute(vbcaxq_s64(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_u8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxu.v16i8; e = a ^ (b & !c) per lane
+    let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
+    let b = u8x16::from([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]);
+    let c = u8x16::from([32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]);
+    let e = u8x16::from([16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]);
+    let r: u8x16 = unsafe { transmute(vbcaxq_u8(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_u16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxu.v8i16; e = a ^ (b & !c) per lane
+    let a = u16x8::from([0, 1, 2, 3, 4, 5, 6, 7]);
+    let b = u16x8::from([8, 9, 10, 11, 12, 13, 14, 15]);
+    let c = u16x8::from([16, 17, 18, 19, 20, 21, 22, 23]);
+    let e = u16x8::from([8, 9, 10, 11, 12, 13, 14, 15]);
+    let r: u16x8 = unsafe { transmute(vbcaxq_u16(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_u32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxu.v4i32; e = a ^ (b & !c) per lane
+    let a = u32x4::from([0, 1, 2, 3]);
+    let b = u32x4::from([4, 5, 6, 7]);
+    let c = u32x4::from([8, 9, 10, 11]);
+    let e = u32x4::from([4, 5, 6, 7]);
+    let r: u32x4 = unsafe { transmute(vbcaxq_u32(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vbcaxq_u64() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.bcaxu.v2i64; e = a ^ (b & !c) per lane
+    let a = u64x2::from([0, 1]);
+    let b = u64x2::from([2, 3]);
+    let c = u64x2::from([4, 5]);
+    let e = u64x2::from([2, 3]);
+    let r: u64x2 = unsafe { transmute(vbcaxq_u64(transmute(a), transmute(b), transmute(c))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vrax1q_u64() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.rax1; e = a ^ rotl(b, 1) per lane
+    let a = u64x2::from([0, 1]);
+    let b = u64x2::from([0x8000_0000_0000_0000, 3]); // top bit set: must wrap around to bit 0
+    let e = u64x2::from([1, 7]);
+    let r: u64x2 = unsafe { transmute(vrax1q_u64(transmute(a), transmute(b))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "sha3")]
+unsafe fn test_vxarq_u64() {
+    // AArch64 llvm intrinsic: llvm.aarch64.crypto.xar; e = rotr(a ^ b, 63) per lane
+    let a = u64x2::from([0, 1]);
+    let b = u64x2::from([2, 3]);
+    let e = u64x2::from([4, 4]);
+    let r: u64x2 = unsafe { transmute(vxarq_u64::<63>(transmute(a), transmute(b))) };
+    assert_eq!(r, e);
+}
+
 #[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "aes")]
 fn test_vmull_p64() {
@@ -698,6 +912,27 @@ fn main() {
             test_vsha512h2q_u64();
             test_vsha512su0q_u64();
             test_vsha512su1q_u64();
+
+            test_veor3q_s8();
+            test_veor3q_s16();
+            test_veor3q_s32();
+            test_veor3q_s64();
+            test_veor3q_u8();
+            test_veor3q_u16();
+            test_veor3q_u32();
+            test_veor3q_u64();
+
+            test_vbcaxq_s8();
+            test_vbcaxq_s16();
+            test_vbcaxq_s32();
+            test_vbcaxq_s64();
+            test_vbcaxq_u8();
+            test_vbcaxq_u16();
+            test_vbcaxq_u32();
+            test_vbcaxq_u64();
+
+            test_vrax1q_u64();
+            test_vxarq_u64();
         }
 
         test_vmull_p64();
diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs
index c322859fec..147bfafba4 100644
--- a/src/intrinsics/llvm_aarch64.rs
+++ b/src/intrinsics/llvm_aarch64.rs
@@ -977,6 +977,88 @@ pub(super) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             );
         }
 
+        "llvm.aarch64.crypto.eor3s.v2i64"
+        | "llvm.aarch64.crypto.eor3s.v4i32"
+        | "llvm.aarch64.crypto.eor3s.v8i16"
+        | "llvm.aarch64.crypto.eor3s.v16i8"
+        | "llvm.aarch64.crypto.eor3u.v2i64"
+        | "llvm.aarch64.crypto.eor3u.v4i32"
+        | "llvm.aarch64.crypto.eor3u.v8i16"
+        | "llvm.aarch64.crypto.eor3u.v16i8" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/EOR3--Three-way-exclusive-OR-
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            simd_trio_for_each_lane(
+                fx,
+                a,
+                b,
+                c,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane, c_lane| {
+                    let xor = fx.bcx.ins().bxor(a_lane, b_lane);
+                    fx.bcx.ins().bxor(xor, c_lane) // a ^ b ^ c; same lowering for eor3s and eor3u
+                },
+            );
+        }
+
+        "llvm.aarch64.crypto.bcaxs.v2i64"
+        | "llvm.aarch64.crypto.bcaxs.v4i32"
+        | "llvm.aarch64.crypto.bcaxs.v8i16"
+        | "llvm.aarch64.crypto.bcaxs.v16i8"
+        | "llvm.aarch64.crypto.bcaxu.v2i64"
+        | "llvm.aarch64.crypto.bcaxu.v4i32"
+        | "llvm.aarch64.crypto.bcaxu.v8i16"
+        | "llvm.aarch64.crypto.bcaxu.v16i8" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/BCAX--Bit-clear-and-exclusive-OR-
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            simd_trio_for_each_lane(
+                fx,
+                a,
+                b,
+                c,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane, c_lane| {
+                    let band_not = fx.bcx.ins().band_not(b_lane, c_lane); // b & !c
+                    fx.bcx.ins().bxor(a_lane, band_not) // a ^ (b & !c)
+                },
+            );
+        }
+
+        "llvm.aarch64.crypto.rax1" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/RAX1--Rotate-and-exclusive-OR-
+            intrinsic_args!(fx, args => (a, b); intrinsic);
+
+            simd_pair_for_each_lane(
+                fx,
+                a,
+                b,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| {
+                    let rot = fx.bcx.ins().rotl_imm(b_lane, 1); // rotate b left by one bit
+                    fx.bcx.ins().bxor(a_lane, rot) // a ^ rotl(b, 1)
+                },
+            );
+        }
+
+        "llvm.aarch64.crypto.xar" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/XAR--Exclusive-OR-and-rotate-
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            let c = c.load_scalar(fx); // rotate amount: one scalar shared by every lane
+
+            simd_pair_for_each_lane(
+                fx,
+                a,
+                b,
+                ret,
+                &|fx, _lane_ty, _res_lane_ty, a_lane, b_lane| {
+                    let xor = fx.bcx.ins().bxor(a_lane, b_lane);
+                    fx.bcx.ins().rotr(xor, c) // rotr(a ^ b, c)
+                },
+            );
+        }
+
         "llvm.aarch64.neon.pmull64" => {
             intrinsic_args!(fx, args => (a, b); intrinsic);