diff --git a/lib/std/crypto/kangarootwelve.zig b/lib/std/crypto/kangarootwelve.zig index 3b740850af31..08f96a30e538 100644 --- a/lib/std/crypto/kangarootwelve.zig +++ b/lib/std/crypto/kangarootwelve.zig @@ -18,6 +18,9 @@ const optimal_vector_len = std.simd.suggestVectorLength(u64) orelse 1; // Benchmarked optimal value for ReleaseFast mode. const large_file_threshold: usize = 2 * 1024 * 1024; // 2 MB +// Number of chunks each thread processes in parallel mode (default: 8) +const batch_count: usize = 8; + // Round constants for Keccak-p[1600,12] const RC = [12]u64{ 0x000000008000808B, @@ -57,11 +60,11 @@ fn KangarooVariant( const separation_byte_pos = .{ .x = sep_x, .y = sep_y }; const padding_pos = .{ .x = pad_x, .y = pad_y }; - inline fn turboSHAKEToBuffer(view: *const MultiSliceView, separation_byte: u8, output: []u8) void { + inline fn turboShakeToBuffer(view: *const MultiSliceView, separation_byte: u8, output: []u8) void { toBufferFn(view, separation_byte, output); } - inline fn turboSHAKEMultiSliceAlloc( + inline fn turboShakeMultiSliceAlloc( allocator: Allocator, view: *const MultiSliceView, separation_byte: u8, @@ -82,8 +85,8 @@ const KT128Variant = KangarooVariant( 3, // separation_byte_pos.y 0, // padding_pos.x (lane 20: last lane of 168-byte rate) 4, // padding_pos.y - turboSHAKE128MultiSliceToBuffer, - turboSHAKE128MultiSlice, + turboShake128MultiSliceToBuffer, + turboShake128MultiSlice, ); /// KangarooTwelve with 256-bit security parameters @@ -96,8 +99,8 @@ const KT256Variant = KangarooVariant( 0, // separation_byte_pos.y 1, // padding_pos.x (lane 16: last lane of 136-byte rate) 3, // padding_pos.y - turboSHAKE256MultiSliceToBuffer, - turboSHAKE256MultiSlice, + turboShake256MultiSliceToBuffer, + turboShake256MultiSlice, ); /// Rotate left for u64 vector @@ -430,7 +433,7 @@ fn keccakPLanes(lanes: *[25]u64) void { } /// Generic non-allocating TurboSHAKE: write output to provided buffer -fn turboSHAKEMultiSliceToBuffer( +fn turboShakeMultiSliceToBuffer( comptime rate: usize, view: *const MultiSliceView, separation_byte: u8, @@ -471,7 +474,7 @@ fn turboSHAKEMultiSliceToBuffer( } /// Generic allocating TurboSHAKE -fn turboSHAKEMultiSlice( +fn turboShakeMultiSlice( comptime rate: usize, allocator: Allocator, view: *const MultiSliceView, @@ -479,46 +482,46 @@ fn turboSHAKEMultiSlice( output_len: usize, ) ![]u8 { const output = try allocator.alloc(u8, output_len); - turboSHAKEMultiSliceToBuffer(rate, view, separation_byte, output); + turboShakeMultiSliceToBuffer(rate, view, separation_byte, output); return output; } /// Non-allocating TurboSHAKE128: write output to provided buffer -fn turboSHAKE128MultiSliceToBuffer( +fn turboShake128MultiSliceToBuffer( view: *const MultiSliceView, separation_byte: u8, output: []u8, ) void { - turboSHAKEMultiSliceToBuffer(168, view, separation_byte, output); + turboShakeMultiSliceToBuffer(168, view, separation_byte, output); } /// Allocating TurboSHAKE128 -fn turboSHAKE128MultiSlice( +fn turboShake128MultiSlice( allocator: Allocator, view: *const MultiSliceView, separation_byte: u8, output_len: usize, ) ![]u8 { - return turboSHAKEMultiSlice(168, allocator, view, separation_byte, output_len); + return turboShakeMultiSlice(168, allocator, view, separation_byte, output_len); } /// Non-allocating TurboSHAKE256: write output to provided buffer -fn turboSHAKE256MultiSliceToBuffer( +fn turboShake256MultiSliceToBuffer( view: *const MultiSliceView, separation_byte: u8, output: []u8, ) void { - turboSHAKEMultiSliceToBuffer(136, view, separation_byte, output); + turboShakeMultiSliceToBuffer(136, view, separation_byte, output); } /// Allocating TurboSHAKE256 -fn turboSHAKE256MultiSlice( +fn turboShake256MultiSlice( allocator: Allocator, view: *const MultiSliceView, separation_byte: u8, output_len: usize, ) ![]u8 { - return turboSHAKEMultiSlice(136, allocator, view, separation_byte, output_len); + return turboShakeMultiSlice(136, allocator, view, separation_byte, output_len); } /// Process N leaves (8KiB chunks) in parallel - generic version @@ -578,7 +581,7 @@ fn processLeaves( /// Context for processing a batch of leaves in a thread const LeafBatchContext = struct { - output_cvs: []u8, + output_cvs: []align(@alignOf(u64)) u8, batch_start: usize, batch_count: usize, view: *const MultiSliceView, @@ -593,9 +596,11 @@ inline fn processNLeaves( view: *const MultiSliceView, j: usize, leaf_buffer: []u8, - output: []u8, + output: []align(@alignOf(u64)) u8, ) void { const cv_size = Variant.cv_size; + comptime std.debug.assert(cv_size % @sizeOf(u64) == 0); + if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| { var leaf_cvs: [N * cv_size]u8 = undefined; processLeaves(Variant, N, leaf_data, &leaf_cvs); @@ -612,7 +617,6 @@ inline fn processNLeaves( fn processLeafBatch(comptime Variant: type, ctx: LeafBatchContext) void { const cv_size = Variant.cv_size; const leaf_buffer = ctx.scratch_buffer[0 .. 8 * chunk_size]; - const cv_scratch = ctx.scratch_buffer[8 * chunk_size .. 8 * chunk_size + cv_size]; var cvs_offset: usize = 0; var j: usize = ctx.batch_start; @@ -621,7 +625,7 @@ fn processLeafBatch(comptime Variant: type, ctx: LeafBatchContext) void { // Process leaves using SIMD (8x, 4x, 2x) based on optimal vector length inline for ([_]usize{ 8, 4, 2 }) |batch_size| { while (optimal_vector_len >= batch_size and j + batch_size * chunk_size <= batch_end) { - processNLeaves(Variant, batch_size, ctx.view, j, leaf_buffer, ctx.output_cvs[cvs_offset..]); + processNLeaves(Variant, batch_size, ctx.view, j, leaf_buffer, @alignCast(ctx.output_cvs[cvs_offset..])); cvs_offset += batch_size * cv_size; j += batch_size * chunk_size; } @@ -632,17 +636,17 @@ fn processLeafBatch(comptime Variant: type, ctx: LeafBatchContext) void { const chunk_len = @min(chunk_size, batch_end - j); if (ctx.view.tryGetSlice(j, j + chunk_len)) |leaf_data| { const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_scratch[0..cv_size]); - @memcpy(ctx.output_cvs[cvs_offset..][0..cv_size], cv_scratch[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, ctx.output_cvs[cvs_offset..][0..cv_size]); } else { ctx.view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]); const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_scratch[0..cv_size]); - @memcpy(ctx.output_cvs[cvs_offset..][0..cv_size], cv_scratch[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, ctx.output_cvs[cvs_offset..][0..cv_size]); } cvs_offset += cv_size; - j += chunk_size; + j += chunk_len; } + + std.debug.assert(cvs_offset == ctx.output_cvs.len); } /// Helper to process N leaves in SIMD and absorb CVs into state @@ -709,12 +713,12 @@ fn ktSingleThreaded(comptime Variant: type, view: *const MultiSliceView, total_l const chunk_len = @min(chunk_size, total_len - j); if (view.tryGetSlice(j, j + chunk_len)) |leaf_data| { const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); final_state.update(cv_buffer[0..cv_size]); // Absorb CV immediately } else { view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]); const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); final_state.update(cv_buffer[0..cv_size]); } j += chunk_size; @@ -745,57 +749,45 @@ fn ktMultiThreaded( // Calculate total number of leaves const total_leaves: usize = (total_len - 1) / chunk_size; - // Check if we have enough threads to benefit from parallelization - const thread_count = Thread.getCpuCount() catch 1; - if (thread_count <= 1) { - // Single-threaded fallback - more efficient than using group.async - ktSingleThreaded(Variant, view, total_len, output); - return; - } - // Allocate buffer for all chaining values - const cvs = try allocator.alloc(u8, total_leaves * cv_size); + const cvs = try allocator.alignedAlloc(u8, std.mem.Alignment.of(u64), total_leaves * cv_size); defer allocator.free(cvs); - // Divide work among threads - const leaves_per_thread = (total_leaves + thread_count - 1) / thread_count; + // Calculate number of threads needed based on batch_count + // Each thread processes at most batch_count chunks + const num_threads = std.math.divCeil(usize, total_leaves, batch_count) catch unreachable; - // Pre-allocate scratch buffers for all threads (8 leaves + CV size) - const scratch_size = 8 * chunk_size + cv_size; - const all_scratch = try allocator.alloc(u8, thread_count * scratch_size); + // Pre-allocate scratch buffers for all threads (8 leaves) + const scratch_size = 8 * chunk_size; + const all_scratch = try allocator.alloc(u8, num_threads * scratch_size); defer allocator.free(all_scratch); - const contexts = try allocator.alloc(LeafBatchContext, thread_count); - defer allocator.free(contexts); - + var group: Io.Group = .init; var leaves_assigned: usize = 0; - var context_count: usize = 0; + var thread_index: usize = 0; while (leaves_assigned < total_leaves) { - const batch_count = @min(leaves_per_thread, total_leaves - leaves_assigned); + const leaves_for_this_batch = @min(batch_count, total_leaves - leaves_assigned); const batch_start = chunk_size + leaves_assigned * chunk_size; const cvs_offset = leaves_assigned * cv_size; - contexts[context_count] = LeafBatchContext{ - .output_cvs = cvs[cvs_offset .. cvs_offset + batch_count * cv_size], + const ctx = LeafBatchContext{ + .output_cvs = @alignCast(cvs[cvs_offset .. cvs_offset + leaves_for_this_batch * cv_size]), .batch_start = batch_start, - .batch_count = batch_count, + .batch_count = leaves_for_this_batch, .view = view, - .scratch_buffer = all_scratch[context_count * scratch_size .. (context_count + 1) * scratch_size], + .scratch_buffer = all_scratch[thread_index * scratch_size .. (thread_index + 1) * scratch_size], .total_len = total_len, }; - leaves_assigned += batch_count; - context_count += 1; - } - - var group: Io.Group = .init; - for (contexts[0..context_count]) |ctx| { group.async(io, struct { fn process(c: LeafBatchContext) void { processLeafBatch(Variant, c); } }.process, .{ctx}); + + leaves_assigned += leaves_for_this_batch; + thread_index += 1; } // Wait for all threads to complete @@ -822,7 +814,7 @@ fn ktMultiThreaded( final_node[final_node_len - 1] = 0xFF; const final_view = MultiSliceView.init(final_node, &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&final_view, 0x06, output); + Variant.turboShakeToBuffer(&final_view, 0x06, output); } /// Generic KangarooTwelve hash function builder. @@ -936,7 +928,7 @@ fn KTHash( if (self.pending_count > 0 and self.pending_count < 2) { var cv_buffer: [64]u8 = undefined; const cv_slice = MultiSliceView.init(self.pending_chunks[0..chunk_size], &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); self.final_state.?.update(cv_buffer[0..cv_size]); self.num_leaves += 1; self.pending_count -= 1; @@ -1072,13 +1064,13 @@ fn KTHash( if (remaining_view.tryGetSlice(offset, leaf_end)) |leaf_data| { // Data is contiguous, use it directly const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); } else { // Data spans boundaries, copy to buffer var leaf_buf: [chunk_size]u8 = undefined; remaining_view.copyRange(offset, leaf_end, leaf_buf[0..leaf_size]); const cv_slice = MultiSliceView.init(leaf_buf[0..leaf_size], &[_]u8{}, &[_]u8{}); - Variant.turboSHAKEToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); } self.final_state.?.update(cv_buffer[0..cv_size]); final_leaves += 1; @@ -1162,7 +1154,7 @@ fn KTHash( /// /// Standardized as RFC 9861 after 8 years of public scrutiny. Supports arbitrary-length /// output and optional customization strings for domain separation. -pub const KT128 = KTHash(KT128Variant, turboSHAKE128MultiSliceToBuffer); +pub const KT128 = KTHash(KT128Variant, turboShake128MultiSliceToBuffer); /// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing /// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive @@ -1176,7 +1168,7 @@ pub const KT128 = KTHash(KT128Variant, turboSHAKE128MultiSliceToBuffer); /// /// Use KT256 when you need extra conservative margins. /// For most applications, KT128 offers better performance with adequate security. -pub const KT256 = KTHash(KT256Variant, turboSHAKE256MultiSliceToBuffer); +pub const KT256 = KTHash(KT256Variant, turboShake256MultiSliceToBuffer); test "KT128 sequential and parallel produce same output for small inputs" { const allocator = std.testing.allocator; @@ -1235,6 +1227,34 @@ test "KT128 sequential and parallel produce same output for large inputs" { } } +test "KT128 sequential and parallel produce same output for many random lengths" { + const allocator = std.testing.allocator; + const io = std.testing.io; + + var prng = std.Random.DefaultPrng.init(std.testing.random_seed); + const random = prng.random(); + + const num_tests = if (builtin.mode == .Debug) 10 else 1000; + const max_length = 250000; + + for (0..num_tests) |_| { + const length = random.intRangeAtMost(usize, 0, max_length); + + const input = try allocator.alloc(u8, length); + defer allocator.free(input); + + random.bytes(input); + + var output_seq: [32]u8 = undefined; + var output_par: [32]u8 = undefined; + + try KT128.hash(input, &output_seq, .{}); + try KT128.hashParallel(input, &output_par, .{}, allocator, io); + + try std.testing.expectEqualSlices(u8, &output_seq, &output_par); + } +} + test "KT128 sequential and parallel produce same output with customization" { const allocator = std.testing.allocator; const io = std.testing.io; @@ -1292,8 +1312,14 @@ test "KT256 sequential and parallel produce same output for large inputs" { const allocator = std.testing.allocator; const io = std.testing.io; - // Test with large input sizes that trigger parallel processing - const test_sizes = [_]usize{ 11 * 1024 * 1024, 20 * 1024 * 1024 }; // 11MB, 20MB + // Test with large input sizes that trigger parallel processing, including + // a size that is just shy of a multiple of the 8KiB chunk to stress the + // final partial leaf. + const test_sizes = [_]usize{ + 11 * 1024 * 1024, // 11MB + 20 * 1024 * 1024, // 20MB + 11 * 1024 * 1024 + 8191, // 11MB + 8191B + }; for (test_sizes) |size| { const input = try allocator.alloc(u8, size);